import os
import re
import docx
import requests
import json
import subprocess
import tempfile
import time
import sys
from pathlib import Path
def extract_text_from_docx(file_path):
    """Return the paragraph text of a .docx file, read with python-docx.

    The text of every paragraph in the document is joined with newlines.
    If opening or reading the document raises, the error is printed and an
    empty string is returned instead.
    """
    try:
        document = docx.Document(file_path)
        return "\n".join(para.text for para in document.paragraphs)
    except Exception as err:
        print(f"无法使用python-docx读取文件 {file_path}: {err}")
        return ""
def extract_text_using_libreoffice(file_path):
    """Convert a .doc/.docx file to plain text with LibreOffice and return it.

    Runs ``soffice --headless --convert-to`` with a temporary output
    directory and reads back the first ``.txt`` file found there.

    Args:
        file_path: Document to convert, as a ``str`` or ``pathlib.Path``.

    Returns:
        The converted text, or ``""`` when the conversion fails or produces
        no output file. Failures are printed; ordinary exceptions are caught
        rather than propagated.
    """
    try:
        # Accept plain strings as well as Path objects.
        source = Path(file_path)
        with tempfile.TemporaryDirectory() as temp_dir:
            # --headless runs LibreOffice without opening a window.
            cmd = [
                "soffice",
                "--headless",
                "--convert-to", "txt:Text (encoded):UTF8",
                "--outdir", temp_dir,
                str(source),
            ]
            print(f" 正在使用LibreOffice转换 {source.name}...")
            subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            # The output file name is chosen by LibreOffice, so take whatever
            # .txt file appeared in the otherwise empty temporary directory.
            converted = next(Path(temp_dir).glob("*.txt"), None)
            if converted is None:
                print(" LibreOffice转换失败:未找到输出文件")
                return ""
            # LibreOffice's UTF-8 text export may start with a byte-order
            # mark; "utf-8-sig" drops it when present and otherwise behaves
            # exactly like "utf-8".
            with open(converted, "r", encoding="utf-8-sig") as f:
                return f.read()
    except subprocess.CalledProcessError as e:
        print(f" LibreOffice转换失败: {e}")
        return ""
    except Exception as e:
        print(f" 提取文本时出错: {e}")
        return ""
def analyze_content_with_llm(text, api_url, api_key, model_name, timeout=60):
    """Ask a chat-completion style LLM API for the student ID and name in *text*.

    Args:
        text: Document text; only the first 4000 characters are sent.
        api_url: Full URL of the endpoint that receives the POST request.
        api_key: Bearer token; the Authorization header is omitted when empty.
        model_name: Value sent as the ``model`` field of the payload.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A ``(student_id, name)`` tuple. ``"xxxxxxxxxxx"`` and ``"未知"`` are
        used for whichever part cannot be found in the reply, and for both
        when the request itself fails. Failures are printed, not raised.
    """
    try:
        headers = {"Content-Type": "application/json"}
        # Only send credentials when a key was actually supplied.
        if api_key:
            headers["Authorization"] = f"Bearer {api_key}"
        payload = {
            "model": model_name,
            "messages": [
                {"role": "system",
                 "content": "你是一个文档分析助手。请从文本中提取学号和姓名。格式为:'学号:XXXXXXXXX,姓名:XXX'。如果找不到学号,请用'xxxxxxxxxxx'代替。"},
                # Cap the prompt size by sending only the start of the text.
                {"role": "user", "content": f"从以下文本中提取学号和姓名:\n\n{text[:4000]}"},
            ],
            "temperature": 0.3,  # low temperature for more deterministic output
        }
        # Without a timeout a stalled server would block the whole batch.
        response = requests.post(api_url, headers=headers, json=payload, timeout=timeout)
        response.raise_for_status()
        response_data = response.json()

        # Support both chat-style ("message") and completion-style ("text")
        # replies; fall back to the raw payload for any other shape.
        choices = response_data.get("choices") if isinstance(response_data, dict) else None
        if choices:
            first = choices[0]
            result = first["message"]["content"] if "message" in first else first["text"]
        else:
            result = str(response_data)
        if not isinstance(result, str):
            # e.g. a null "content": treat as "nothing found", not as an error.
            result = str(result)

        # The prompt asks for a full-width colon, but models often answer with
        # an ASCII colon or add spaces, so accept both. Names may contain the
        # middle dot used in transliterated names.
        student_id_match = re.search(r'学号\s*[::]\s*(\w+)', result)
        name_match = re.search(r'姓名\s*[::]\s*([\w·]+)', result)
        student_id = student_id_match.group(1) if student_id_match else "xxxxxxxxxxx"
        name = name_match.group(1) if name_match else "未知"
        return student_id, name
    except Exception as e:
        print(f"API调用失败: {e}")
        return "xxxxxxxxxxx", "未知"
def rename_documents(folder_path, api_url, api_key, model_name):
    """Rename every .doc/.docx file in a folder after its student ID and name.

    For each document the text is extracted (LibreOffice first, python-docx
    as a fallback for .docx), sent to the LLM API to obtain the student ID
    and name, and the file is renamed to
    ``通信数据分析实训-实验报告<student_id> <name><suffix>`` in the same folder.
    When that name is already taken by a different file, a numeric suffix
    such as ``(1)`` is added so that no existing file is overwritten.

    Args:
        folder_path: Folder to scan (non-recursive).
        api_url: Endpoint passed through to the LLM call.
        api_key: API key passed through to the LLM call.
        model_name: Model name passed through to the LLM call.

    Returns:
        None. Progress and a final summary are printed.
    """
    folder = Path(folder_path)
    if not folder.is_dir():
        print(f"错误: 文件夹不存在或不是目录: {folder}")
        return

    # Sorted so that the processing order is the same on every run.
    doc_files = sorted(list(folder.glob("*.doc")) + list(folder.glob("*.docx")))
    total_files = len(doc_files)
    success_count = 0
    print(f"找到 {total_files} 个文档文件需要处理")

    for index, file_path in enumerate(doc_files, 1):
        print(f"\n正在处理 [{index}/{total_files}]: {file_path.name}")
        try:
            # LibreOffice handles both .doc and .docx, so try it first.
            text = extract_text_using_libreoffice(file_path)
            if not text and file_path.suffix.lower() == ".docx":
                print(" 尝试使用python-docx提取文本...")
                text = extract_text_from_docx(file_path)
            if not text:
                print(" 无法提取文本,跳过此文件")
                continue

            print(" 正在分析文档内容...")
            student_id, name = analyze_content_with_llm(text, api_url, api_key, model_name)

            new_filename = f"通信数据分析实训-实验报告{student_id} {name}{file_path.suffix}"
            # Path.rename silently replaces an existing file on POSIX, so two
            # documents resolving to the same name would destroy one of them.
            new_file_path = _pick_free_path(file_path.parent / new_filename, file_path)
            try:
                if new_file_path != file_path:
                    file_path.rename(new_file_path)
                print(f" 已重命名为: {new_file_path.name}")
                success_count += 1
            except Exception as e:
                print(f" 重命名失败: {e}")
        except Exception as e:
            print(f" 处理文件时出错: {e}")

    print(f"\n处理完成! 成功重命名 {success_count}/{total_files} 个文件")


def _pick_free_path(target, source):
    """Return *target*, or a numbered variant of it not used by another file."""
    candidate = target
    counter = 0
    # A candidate is acceptable if nothing exists there, or if what exists is
    # the source file itself (already correctly named).
    while candidate.exists() and not candidate.samefile(source):
        counter += 1
        candidate = target.with_name(f"{target.stem}({counter}){target.suffix}")
    return candidate
def check_libreoffice():
    """Return True if the ``soffice`` executable can be launched.

    Only launchability is tested: the exit status and output of
    ``soffice --version`` are ignored.
    """
    try:
        subprocess.run(
            ["soffice", "--version"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except OSError:
        # Covers FileNotFoundError (not on PATH) as well as PermissionError
        # (found but not executable); both are OSError subclasses.
        return False
    return True
def main():
    """Interactive entry point: check prerequisites, prompt, then rename.

    Exits with status 1 when LibreOffice is unavailable, when the given
    folder does not exist, or when no API URL is entered.
    """
    print("文档重命名工具 v1.0 (LibreOffice版)")
    print("===============================")
    # LibreOffice is the primary text extractor, so it is required.
    if not check_libreoffice():
        print("错误: 未找到LibreOffice。请确保已安装LibreOffice并且'soffice'命令可用。")
        print(" - macOS: 请确保LibreOffice已安装,并链接到PATH")
        print(" - Linux: 请安装LibreOffice (sudo apt-get install libreoffice)")
        print(" - Windows: 请安装LibreOffice并确保其在PATH中")
        sys.exit(1)
    # NOTE: this module also imports docx and requests unconditionally at the
    # top of the file, so a missing package fails there first; these checks
    # only take effect if those imports are made optional.
    try:
        import docx  # noqa: F401
    except ImportError:
        print("警告: 未安装python-docx。建议安装以提供更好的.docx文件支持。")
        print(" pip install python-docx")
    try:
        import requests  # noqa: F401
    except ImportError:
        print("错误: 未安装requests库,该库是必需的。")
        print(" pip install requests")
        sys.exit(1)

    # Pasted or drag-and-dropped paths often carry surrounding quotes or
    # whitespace, and "~" is not expanded outside a shell.
    folder_path = input("\n请输入包含文档的文件夹路径: ").strip().strip("\"'")
    folder_path = os.path.expanduser(folder_path)
    if not os.path.isdir(folder_path):
        print(f"错误: 文件夹不存在: {folder_path}")
        sys.exit(1)
    api_url = input("请输入API链接 (例如: http://your-api-server/v1/chat/completions): ").strip()
    if not api_url:
        print("错误: API链接不能为空。")
        sys.exit(1)
    api_key = input("请输入API密钥 (如不需要可留空): ").strip()
    model_name = input("请输入模型名称 (例如: gpt-3.5-turbo): ").strip()

    print("\n开始处理文件...")
    rename_documents(folder_path, api_url, api_key, model_name)


if __name__ == "__main__":
    main()