Administrator
发布于 2025-02-28 / 9 阅读 / 0 评论 / 0 点赞

批量重命名

import os
import re
import docx
import requests
import json
import subprocess
import tempfile
import time
import sys
from pathlib import Path


def extract_text_from_docx(file_path):
    """Read a .docx file with python-docx and return its paragraph text.

    Only the document's paragraphs are read; their text is joined with
    newline characters.

    Args:
        file_path: Location of the .docx file, handed directly to
            ``docx.Document``.

    Returns:
        The newline-joined paragraph text, or an empty string if any
        exception occurs while opening or reading the document (the error
        is printed rather than raised).
    """
    try:
        document = docx.Document(file_path)
        lines = [para.text for para in document.paragraphs]
        return "\n".join(lines)
    except Exception as exc:
        # Best effort: report the problem and signal "no text" with "".
        print(f"无法使用python-docx读取文件 {file_path}: {exc}")
        return ""


def extract_text_using_libreoffice(file_path, timeout=120):
    """Convert a .doc/.docx file to plain text with LibreOffice and return it.

    Runs ``soffice --headless --convert-to txt`` into a temporary directory,
    then reads back the first ``*.txt`` file found there. The temporary
    directory is removed when the function returns.

    Args:
        file_path: Path of the document to convert (``str`` or ``Path``).
        timeout: Maximum number of seconds to wait for ``soffice``. Without a
            limit, a hung converter would block the caller indefinitely.

    Returns:
        The converted text, or an empty string when the conversion fails,
        times out, produces no output file, or any other error occurs. The
        function prints the reason and never raises.
    """
    try:
        # Accept plain strings too; kept inside the try so that a bad
        # argument still yields "" instead of an exception.
        file_path = Path(file_path)

        with tempfile.TemporaryDirectory() as temp_dir:
            # --headless runs LibreOffice without opening a window.
            cmd = [
                "soffice",
                "--headless",
                "--convert-to", "txt:Text (encoded):UTF8",
                "--outdir", temp_dir,
                str(file_path)
            ]

            print(f"  正在使用LibreOffice转换 {file_path.name}...")
            subprocess.run(
                cmd,
                check=True,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                timeout=timeout,
            )

            # The output name is derived by LibreOffice, so look for any
            # .txt file rather than assuming a specific name.
            converted_files = sorted(Path(temp_dir).glob("*.txt"))
            if not converted_files:
                print("  LibreOffice转换失败:未找到输出文件")
                return ""

            # utf-8-sig drops a leading byte-order mark if the exporter wrote
            # one and otherwise decodes exactly like utf-8.
            with open(converted_files[0], "r", encoding="utf-8-sig") as f:
                return f.read()

    except subprocess.TimeoutExpired as e:
        print(f"  LibreOffice转换超时: {e}")
        return ""
    except subprocess.CalledProcessError as e:
        print(f"  LibreOffice转换失败: {e}")
        return ""
    except Exception as e:
        print(f"  提取文本时出错: {e}")
        return ""


def analyze_content_with_llm(text, api_url, api_key, model_name, timeout=120):
    """Ask a chat-completion style LLM API for the student ID and name in `text`.

    Sends a POST request to `api_url` whose system prompt asks the model to
    answer in a fixed '学号:...,姓名:...' form, then parses the reply with
    regular expressions.

    Args:
        text: Document text; only the first 4000 characters are sent.
        api_url: Full URL of the completion endpoint.
        api_key: Bearer token; when empty, no Authorization header is sent.
        model_name: Value of the "model" field in the request payload.
        timeout: Seconds to wait for the HTTP request. Without a timeout the
            request could block forever on an unresponsive server.

    Returns:
        A ``(student_id, name)`` tuple. ``student_id`` is ``"xxxxxxxxxxx"``
        and ``name`` is ``"未知"`` when the respective value cannot be found
        or when the request fails for any reason (the error is printed, not
        raised).
    """
    try:
        headers = {
            "Content-Type": "application/json"
        }

        # Only send credentials when a key was actually supplied.
        if api_key:
            headers["Authorization"] = f"Bearer {api_key}"

        payload = {
            "model": model_name,
            "messages": [
                {"role": "system",
                 "content": "你是一个文档分析助手。请从文本中提取学号和姓名。格式为:'学号:XXXXXXXXX,姓名:XXX'。如果找不到学号,请用'xxxxxxxxxxx'代替。"},
                {"role": "user", "content": f"从以下文本中提取学号和姓名:\n\n{text[:4000]}"}  # cap the prompt size
            ],
            "temperature": 0.3  # low temperature for more deterministic answers
        }

        response = requests.post(api_url, headers=headers, json=payload, timeout=timeout)
        response.raise_for_status()

        response_data = response.json()

        # Support both chat-style ("message") and completion-style ("text")
        # response shapes; fall back to the raw payload otherwise.
        choices = response_data.get("choices") if isinstance(response_data, dict) else None
        if choices:
            first_choice = choices[0]
            if "message" in first_choice:
                result = first_choice["message"]["content"]
            else:
                result = first_choice["text"]
        else:
            result = str(response_data)

        return _parse_student_info(result)
    except Exception as e:
        print(f"API调用失败: {e}")
        print(f"详细错误: {type(e).__name__}: {e}")
        return "xxxxxxxxxxx", "未知"


def _parse_student_info(reply):
    """Extract ``(student_id, name)`` from the model's reply text.

    Accepts an ASCII or full-width colon after the label, with optional
    whitespace around it, because the model does not always reproduce the
    requested punctuation exactly.
    """
    student_id_match = re.search(r'学号\s*[:\uFF1A]\s*(\w+)', reply)
    name_match = re.search(r'姓名\s*[:\uFF1A]\s*(\w+)', reply)

    student_id = student_id_match.group(1) if student_id_match else "xxxxxxxxxxx"
    name = name_match.group(1) if name_match else "未知"
    return student_id, name


def rename_documents(folder_path, api_url, api_key, model_name):
    """Rename every .doc/.docx file in a folder after the student it belongs to.

    For each document the text is extracted (LibreOffice first, python-docx
    as a fallback for .docx), the LLM API is asked for the student ID and
    name, and the file is renamed to
    ``通信数据分析实训-实验报告<student_id> <name><suffix>`` in the same folder.
    If that name is already taken by another file, a numbered suffix such as
    `` (1)`` is appended so that no existing file is overwritten.

    Args:
        folder_path: Folder containing the documents (not searched recursively).
        api_url: Endpoint forwarded to ``analyze_content_with_llm``.
        api_key: API key forwarded to ``analyze_content_with_llm``.
        model_name: Model name forwarded to ``analyze_content_with_llm``.

    Returns:
        None. Progress and a final summary are printed.
    """
    folder = Path(folder_path)
    if not folder.is_dir():
        print(f"错误: 文件夹不存在或不是目录: {folder}")
        return

    # Sorted so the processing order (and any numbered suffixes) is stable.
    doc_files = sorted(folder.glob("*.doc")) + sorted(folder.glob("*.docx"))

    total_files = len(doc_files)
    success_count = 0

    print(f"找到 {total_files} 个文档文件需要处理")

    for index, file_path in enumerate(doc_files, 1):
        print(f"\n正在处理 [{index}/{total_files}]: {file_path.name}")

        try:
            # LibreOffice handles both .doc and .docx.
            text = extract_text_using_libreoffice(file_path)

            # python-docx can only read .docx, so it is a fallback for those.
            if not text and file_path.suffix.lower() == ".docx":
                print("  尝试使用python-docx提取文本...")
                text = extract_text_from_docx(file_path)

            if not text:
                print("  无法提取文本,跳过此文件")
                continue

            print("  正在分析文档内容...")
            student_id, name = analyze_content_with_llm(text, api_url, api_key, model_name)

            base_name = f"通信数据分析实训-实验报告{student_id} {name}"
            # Path.rename silently replaces an existing file on POSIX, so
            # pick a name that is not already in use by another document.
            new_file_path = _pick_free_path(file_path, base_name)

            try:
                file_path.rename(new_file_path)
                print(f"  已重命名为: {new_file_path.name}")
                success_count += 1
            except OSError as e:
                print(f"  重命名失败: {e}")
        except Exception as e:
            # Keep going with the remaining files if one of them fails.
            print(f"  处理文件时出错: {e}")

    print(f"\n处理完成! 成功重命名 {success_count}/{total_files} 个文件")


def _pick_free_path(source, base_name):
    """Return a sibling path of `source` named after `base_name` that is free.

    The file suffix of `source` is kept. If the plain name is taken by a
    different file, `` (1)``, `` (2)``, ... are tried in turn. A candidate
    equal to `source` itself is accepted (the file already has that name).
    """
    suffix = source.suffix
    candidate = source.with_name(f"{base_name}{suffix}")
    counter = 1
    while candidate != source and candidate.exists():
        candidate = source.with_name(f"{base_name} ({counter}){suffix}")
        counter += 1
    return candidate


def check_libreoffice():
    """Report whether the ``soffice`` executable can be launched.

    Runs ``soffice --version`` with its output captured and discarded.

    Returns:
        True if the command could be started, False if the executable was
        not found. The command's exit status is not inspected.
    """
    probe_command = ["soffice", "--version"]
    try:
        subprocess.run(
            probe_command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
    except FileNotFoundError:
        return False
    else:
        return True


if __name__ == "__main__":
    # Interactive entry point: verify prerequisites, collect the settings
    # from the user, then rename the documents in the chosen folder.
    print("文档重命名工具 v1.0 (LibreOffice版)")
    print("===============================")

    # LibreOffice is required for text extraction.
    if not check_libreoffice():
        print("错误: 未找到LibreOffice。请确保已安装LibreOffice并且'soffice'命令可用。")
        print("  - macOS: 请确保LibreOffice已安装,并链接到PATH")
        print("  - Linux: 请安装LibreOffice (sudo apt-get install libreoffice)")
        print("  - Windows: 请安装LibreOffice并确保其在PATH中")
        sys.exit(1)

    # NOTE(review): docx and requests are also imported unconditionally at
    # the top of the file, so a missing package raises ImportError there and
    # these checks are only reached if those imports are made optional.
    try:
        import docx
    except ImportError:
        print("警告: 未安装python-docx。建议安装以提供更好的.docx文件支持。")
        print("  pip install python-docx")

    try:
        import requests
    except ImportError:
        print("错误: 未安装requests库,该库是必需的。")
        print("  pip install requests")
        sys.exit(1)

    # Pasted or dragged-in paths often carry surrounding whitespace or
    # quotes; strip them so the folder is actually found.
    folder_path = input("\n请输入包含文档的文件夹路径: ").strip().strip("\"'")
    if not Path(folder_path).is_dir():
        print(f"错误: 文件夹不存在或不是目录: {folder_path}")
        sys.exit(1)

    api_url = input("请输入API链接 (例如: http://your-api-server/v1/chat/completions): ").strip()
    if not api_url:
        # Fail once here instead of once per document later on.
        print("错误: API链接不能为空。")
        sys.exit(1)

    # Stray whitespace in the key would end up inside the Authorization header.
    api_key = input("请输入API密钥 (如不需要可留空): ").strip()
    model_name = input("请输入模型名称 (例如: gpt-3.5-turbo): ").strip()

    print("\n开始处理文件...")
    rename_documents(folder_path, api_url, api_key, model_name)

评论