pdf_parse_service.py 2.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566
  1. import os
  2. import json
  3. import shutil
  4. import uuid
  5. from fastapi import UploadFile, HTTPException
  6. from config.settings import PDF_SAVE_DIR, JSON_CACHE_DIR
  7. from parser.base_parser import get_file_hash
  8. from parser.ventilation_plan_parser import VentilationPlanParser
  9. def parse_pdf(file: UploadFile) -> dict:
  10. file_path = f"{PDF_SAVE_DIR}/{uuid.uuid4()}.pdf"
  11. with open(file_path, "wb") as f:
  12. f.write(file.file.read())
  13. hash_id = get_file_hash(file_path)
  14. new_pdf_path = rename_pdf_by_hash(file_path,hash_id)
  15. cache_path = f"{JSON_CACHE_DIR}/{hash_id}.json"
  16. if os.path.exists(cache_path):
  17. with open(cache_path, "r", encoding="utf-8") as f:
  18. return json.load(f)
  19. data = VentilationPlanParser().parse(new_pdf_path)
  20. with open(cache_path, "w", encoding="utf-8") as f:
  21. json.dump(data, f, ensure_ascii=False, indent=2)
  22. return data
  23. def rename_pdf_by_hash(original_pdf_path: str, hash_id: str) -> str:
  24. """
  25. 根据hash_id重命名PDF文件
  26. :param original_pdf_path: 原PDF文件路径
  27. :param hash_id: 生成的唯一hash标识
  28. :return: 新文件完整路径
  29. """
  30. # 拼接新文件名:目录 + hash_id.pdf
  31. new_file_name = f"{hash_id}.pdf"
  32. new_pdf_path = os.path.join(PDF_SAVE_DIR, new_file_name)
  33. # 执行重命名
  34. shutil.move(original_pdf_path, new_pdf_path)
  35. return new_pdf_path
  36. def get_latest_file(dir_path: str, suffix: str) -> str:
  37. """
  38. 获取目录下修改时间最新的指定后缀文件
  39. :param dir_path: 目标文件夹路径
  40. :param suffix: 文件后缀,例: .json、.pdf、.txt
  41. :return: 最新文件完整路径
  42. """
  43. # 校验目录是否存在
  44. if not os.path.isdir(dir_path):
  45. raise HTTPException(status_code=404, detail="文件夹不存在")
  46. # 筛选对应后缀文件
  47. file_list = []
  48. for file in os.listdir(dir_path):
  49. file_full_path = os.path.join(dir_path, file)
  50. if os.path.isfile(file_full_path) and file.lower().endswith(suffix.lower()):
  51. mtime = os.path.getmtime(file_full_path)
  52. file_list.append((mtime, file_full_path))
  53. if not file_list:
  54. raise HTTPException(status_code=404, detail=f"目录下未找到 {suffix} 格式文件")
  55. # 按修改时间倒序,取最新文件
  56. file_list.sort(reverse=True, key=lambda x: x[0])
  57. return file_list[0][1]