1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151
| import requests import random import os from concurrent.futures import ThreadPoolExecutor, as_completed import scipdf import string from datetime import datetime
class GROBID_OFFLINE_EXCEPTION(Exception):
    """Signal that the GROBID service is unavailable."""
class PDFToMarkdown:
    """Convert one PDF, or every PDF under a directory, into one Markdown file.

    Parsing is delegated to ``scipdf.parse_pdf_to_dict`` against a GROBID
    server chosen from ``grobid_urls``; the per-file Markdown fragments are
    merged and written to a timestamped ``.md`` file.
    """

    # Endpoints used when the caller does not supply ``grobid_urls``.
    DEFAULT_GROBID_URLS = (
        "https://qingxu98-grobid.hf.space",
        "https://qingxu98-grobid2.hf.space",
        "https://qingxu98-grobid8.hf.space",
    )

    def __init__(self, input_path, grobid_urls=None):
        """Initialise a PDFToMarkdown instance.

        Args:
            input_path (str): Path of the file or directory to process.
            grobid_urls (list): Optional list of GROBID server URLs. Defaults
                to ``DEFAULT_GROBID_URLS``.
        """
        self.input_path = input_path
        # Always store a private copy: get_avail_grobid_url() prunes this
        # list, and the caller's list must not be mutated behind its back.
        self.grobid_urls = list(
            self.DEFAULT_GROBID_URLS if grobid_urls is None else grobid_urls
        )

    def get_avail_grobid_url(self):
        """Return the URL of a live GROBID server, or ``None`` if none is live.

        Servers are probed in random order through ``/api/isalive``. Every
        server that fails the probe (request error or a body other than
        ``'true'``) is removed from ``self.grobid_urls``, so the search
        always terminates. The returned URL has trailing slashes stripped.
        """
        # Iterate over a shuffled snapshot so pruning self.grobid_urls inside
        # the loop is safe and each server is probed at most once.
        for candidate in random.sample(self.grobid_urls, len(self.grobid_urls)):
            base_url = candidate.rstrip('/')
            try:
                res = requests.get(f"{base_url}/api/isalive", timeout=5)
            except requests.RequestException:
                alive = False
            else:
                alive = res.text == 'true'
            if alive:
                return base_url
            # Remove the entry exactly as configured (not the stripped form),
            # otherwise URLs given with a trailing '/' raise ValueError here.
            self.grobid_urls.remove(candidate)
        return None

    @staticmethod
    def dict_to_markdown(article_json):
        """Convert an article dictionary into a Markdown string.

        Reads the optional keys ``title``, ``doi``, ``authors``, ``abstract``
        and ``sections`` (a list of dicts with ``heading`` and ``text``).
        Missing keys fall back to placeholders or empty strings.
        """
        authors = article_json.get('authors', '无作者')
        if isinstance(authors, (list, tuple)):
            # Render a list of authors as text rather than as a Python repr.
            authors = ', '.join(str(author) for author in authors)

        markdown_lines = [
            f"# {article_json.get('title', '无标题')} \n",
            f"> doi:{article_json.get('doi', '')} \n",
            f"+ authors\n{authors} \n",
            f"+ abstract\n{article_json.get('abstract', '无摘要')} \n",
        ]
        for section in article_json.get('sections') or []:
            # Tolerate sections that lack a heading or a body.
            heading = section.get('heading', '')
            text = section.get('text', '')
            markdown_lines.append(f"+ {heading}\n{text}\n")

        return "\n".join(markdown_lines)

    @staticmethod
    def save_markdown_file(filename, content):
        """Write ``content`` to ``filename`` as UTF-8, overwriting any file."""
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(content)

    def parse_pdf(self, pdf_path, grobid_url):
        """Parse a single PDF file and return the article dictionary.

        Raises:
            FileNotFoundError: ``pdf_path`` is not an existing file.
            GROBID_OFFLINE_EXCEPTION: re-raised with a user-facing message.
            RuntimeError: re-raised with a user-facing message.
        """
        if not os.path.isfile(pdf_path):
            raise FileNotFoundError(f"指定路径下没有找到 PDF 文件: {pdf_path}")

        grobid_url = grobid_url.rstrip('/')

        try:
            return scipdf.parse_pdf_to_dict(pdf_path, grobid_url=grobid_url)
        except GROBID_OFFLINE_EXCEPTION as err:
            # Chain explicitly so the underlying cause stays in the traceback.
            raise GROBID_OFFLINE_EXCEPTION("GROBID 服务不可用,检查配置中的 GROBID_URL。") from err
        except RuntimeError as err:
            raise RuntimeError("解析 PDF 失败,请检查 PDF 是否损坏。") from err

    def process_pdf_file(self, pdf_path, grobid_url):
        """Process a single PDF file and return its Markdown content.

        Best effort: any exception is reported on stdout and ``None`` is
        returned, so one bad file does not abort a whole batch.
        """
        print(f"正在解析: {pdf_path}")
        try:
            pdf_article_dict = self.parse_pdf(pdf_path, grobid_url)
            return self.dict_to_markdown(pdf_article_dict)
        except Exception as e:
            print(f"处理文件 {pdf_path} 时发生错误: {e}")
            return None

    def _collect_pdf_files(self):
        """Return the PDF paths designated by ``self.input_path``.

        A file path is returned as-is. A directory is walked recursively and
        files whose extension is ``.pdf`` (case-insensitive) are returned in
        sorted order.

        Raises:
            ValueError: ``self.input_path`` is neither a file nor a directory.
        """
        if os.path.isfile(self.input_path):
            return [self.input_path]
        if os.path.isdir(self.input_path):
            return sorted(
                os.path.join(dirpath, filename)
                for dirpath, _, filenames in os.walk(self.input_path)
                for filename in filenames
                if filename.lower().endswith('.pdf')
            )
        raise ValueError("输入路径既不是文件也不是文件夹。")

    def process(self):
        """Process the input file or directory and return the Markdown path.

        Returns:
            The name of the merged Markdown file (opened relative to the
            current working directory), or ``None`` when no file produced
            any content.

        Raises:
            RuntimeError: no GROBID server is available.
            ValueError: the input path is neither a file nor a directory.
        """
        grobid_url = self.get_avail_grobid_url()
        if grobid_url is None:
            raise RuntimeError("没有可用的 GROBID 服务,请检查您的服务器配置。")

        pdf_files = self._collect_pdf_files()

        with ThreadPoolExecutor(max_workers=5) as executor:
            futures = [
                executor.submit(self.process_pdf_file, pdf, grobid_url)
                for pdf in pdf_files
            ]
            # Collect in submission order (not completion order) so the
            # merged document is deterministic for a given input.
            results = [future.result() for future in futures]

        markdown_contents = [markdown for markdown in results if markdown]
        if not markdown_contents:
            print("没有有效的 Markdown 内容生成。")
            return None

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        random_suffix = ''.join(random.choices(string.ascii_lowercase, k=2))
        output_filename = f"{timestamp}_{random_suffix}.md"
        self.save_markdown_file(output_filename, "\n\n".join(markdown_contents))
        print(f"所有 Markdown 文件已合并并保存为 {output_filename}")
        return output_filename
def _run_example():
    """Run the converter once with placeholder paths and print the result.

    The input path and GROBID URLs below are placeholders to be replaced
    with real values before running the script.
    """
    converter = PDFToMarkdown(
        'your_file_or_directory_path',
        grobid_urls=[
            "https://your-custom-grobid-server.com",
            "https://another-custom-grobid-server.com",
        ],
    )
    print("生成的文件路径:", converter.process())


if __name__ == "__main__":
    _run_example()
|