加入markdown转word功能

2025-10-11 10:36:24 +08:00 · 2025-10-11 10:36:24 +08:00 · 7ef1d542b3
commit 7ef1d542b3
parent 0bcfb262dc
4 changed files with 289 additions and 9 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1 +1,2 @@
 .env
 /word_output
--- a/README.md
+++ b/README.md
@ -1 +1,24 @@
 # chyoso-toolkit
 大模型工具集
 ## 安装依赖
 ```bash
 pip install -r requirements.txt
 ```
 ## 功能列表
 - 音频转写
 - 音频批量转写
 - Markdown 转 Word（纯 Python 实现，无需安装 pandoc）
 - PDF 下载
 - 图片 OCR 识别
 - 批量图片识别
 ## 运行
 ```bash
 python chyoso_toolkit_ui.py
 ```
--- a/chyoso_toolkit_ui.py
+++ b/chyoso_toolkit_ui.py
@ -5,9 +5,15 @@ import os
 import re
 import io
 import base64
 import datetime
 from dotenv import load_dotenv
 from openai import OpenAI
 from PIL import Image
 from docx import Document
 from docx.shared import Pt, Inches
 from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
 import markdown
 from html.parser import HTMLParser
 load_dotenv()
@ -143,6 +149,244 @@ def convert_to_docx(text):
    return "file.docx"
 class MarkdownToDocxParser(HTMLParser):
    """Translate an HTML stream into content appended to a python-docx document.

    Feed the HTML through ``feed()`` and finish with ``close()``. Headings,
    paragraphs, bullet and numbered list items, bold, italic, inline code,
    line breaks and tables are appended to ``document`` as they are parsed.

    Only the outermost table is converted. Its cells are buffered while the
    table is open and written when ``</table>`` arrives, because the row and
    column counts are not known before that point. Text inside a table but
    outside one of its cells (indentation between tags, nested tables) is
    discarded.

    Args:
        document: Object exposing ``add_heading(level=...)``,
            ``add_paragraph(style=...)`` and ``add_table(rows=..., cols=...)``;
            in this file it is the result of ``Document()``.
    """

    _HEADING_TAGS = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')

    def __init__(self, document):
        super().__init__()
        self.doc = document
        self.current_paragraph = None
        self.current_run = None
        self.in_bold = False
        self.in_italic = False
        self.in_code = False
        self.in_heading = False
        self.heading_level = 0
        self.list_items = []
        # Currently open <ul>/<ol> tags, innermost last; selects the list style.
        self._list_stack = []
        # True from <li> until its paragraph receives text, so that a <p>
        # directly inside the item reuses the list paragraph.
        self._li_paragraph_empty = False
        # True right after <br>; the newline that follows it in the HTML is
        # formatting only and must not become a second line break.
        self._after_break = False
        # Table state
        self.in_table = False
        self.table_depth = 0  # nesting depth of open <table> tags
        self.current_table = None
        self.current_row = None
        self.current_cell = None
        self.table_rows = []
        self.current_row_cells = []
        self.current_cell_content = []  # text tuples and break markers of the open cell
        self.is_header_row = False

    @staticmethod
    def _style_run(run, bold, italic, code):
        """Apply the requested character formatting to ``run``."""
        if bold:
            run.bold = True
        if italic:
            run.italic = True
        if code:
            run.font.name = 'Courier New'
            run.font.size = Pt(10)

    def handle_starttag(self, tag, attrs):
        """Open the document element or formatting state that ``tag`` starts."""
        if tag != 'br':
            self._after_break = False

        if tag in self._HEADING_TAGS:
            self.in_heading = True
            self.heading_level = int(tag[1])
            self.current_paragraph = self.doc.add_heading(level=self.heading_level)
            self.current_paragraph.text = ''
        elif tag == 'p':
            if not self.in_table:
                # A <p> that opens a list item continues the item's paragraph
                # instead of leaving an empty bullet behind.
                reuse_item = (self._li_paragraph_empty
                              and self.current_paragraph is not None)
                if not reuse_item:
                    self.current_paragraph = self.doc.add_paragraph()
            elif self.current_cell is not None:
                # Inside a table cell a paragraph is recorded as a line break.
                self.current_cell_content.append({'type': 'break'})
        elif tag in ('strong', 'b'):
            self.in_bold = True
        elif tag in ('em', 'i'):
            self.in_italic = True
        elif tag == 'code':
            self.in_code = True
        elif tag in ('ul', 'ol'):
            self._list_stack.append(tag)
        elif tag == 'li':
            if not self.in_table:
                numbered = bool(self._list_stack) and self._list_stack[-1] == 'ol'
                # NOTE(review): numbering of 'List Number' paragraphs may
                # continue across separate lists - confirm in Word.
                self.current_paragraph = self.doc.add_paragraph(
                    style='List Number' if numbered else 'List Bullet')
                self._li_paragraph_empty = True
        elif tag == 'br':
            if self.in_table and self.current_cell is not None:
                self.current_cell_content.append({'type': 'break'})
            elif self.current_paragraph is not None:
                self.current_paragraph.add_run().add_break()
            self._after_break = True
        # Table handling: only the outermost table (depth 1) is captured.
        elif tag == 'table':
            self.table_depth += 1
            if self.table_depth == 1:
                self.in_table = True
                self.table_rows = []
        elif tag == 'thead':
            if self.table_depth == 1:
                self.is_header_row = True
        elif tag == 'tbody':
            if self.table_depth == 1:
                self.is_header_row = False
        elif tag == 'tr':
            if self.table_depth == 1:
                self.current_row_cells = []
        elif tag in ('th', 'td'):
            if self.table_depth == 1:
                self.current_cell = []
                self.current_cell_content = []
                if tag == 'th':
                    self.in_bold = True  # header cells are bold

    def handle_endtag(self, tag):
        """Close the document element or formatting state that ``tag`` ends."""
        if tag in self._HEADING_TAGS:
            self.in_heading = False
            self.heading_level = 0
            # Drop the heading so later stray text cannot be appended to it.
            self.current_paragraph = None
        elif tag == 'p':
            if not self.in_table:
                self.current_paragraph = None
                self._li_paragraph_empty = False
        elif tag in ('strong', 'b'):
            self.in_bold = False
        elif tag in ('em', 'i'):
            self.in_italic = False
        elif tag == 'code':
            self.in_code = False
        elif tag in ('ul', 'ol'):
            if self._list_stack:
                self._list_stack.pop()
        elif tag == 'li':
            if not self.in_table:
                self.current_paragraph = None
                self._li_paragraph_empty = False
        # Table handling
        elif tag == 'table':
            # Ignore an unmatched </table>; a negative depth would stop every
            # later table from being recognised as the outermost one.
            if self.table_depth > 0:
                self.table_depth -= 1
                if self.table_depth == 0:
                    self.in_table = False
                    self._create_table()
        elif tag == 'thead':
            if self.table_depth == 1:
                self.is_header_row = False
        elif tag == 'tr':
            if self.table_depth == 1 and self.current_row_cells:
                self.table_rows.append(self.current_row_cells)
                self.current_row_cells = []
        elif tag in ('th', 'td'):
            if self.table_depth == 1 and self.current_cell is not None:
                # Store the cell content together with its formatting flags.
                self.current_row_cells.append(self.current_cell_content.copy())
                self.current_cell = None
                self.current_cell_content = []
            if tag == 'th':
                self.in_bold = False

    def _create_table(self):
        """Write the buffered rows of the finished table into the document."""
        if not self.table_rows:
            return

        # Rows may differ in length; the widest one sets the column count.
        max_cols = max(len(row) for row in self.table_rows)
        if max_cols == 0:
            return

        table = self.doc.add_table(rows=len(self.table_rows), cols=max_cols)
        table.style = 'Light Grid Accent 1'

        for i, row_data in enumerate(self.table_rows):
            row = table.rows[i]
            for j, cell_items in enumerate(row_data):
                if j >= len(row.cells):
                    continue
                cell = row.cells[j]
                # Reset the cell, then write into its first paragraph.
                cell.text = ''
                para = cell.paragraphs[0] if cell.paragraphs else cell.add_paragraph()

                for item in cell_items:
                    if isinstance(item, dict):
                        if item.get('type') == 'break':
                            para.add_run().add_break()
                        continue
                    text, is_bold, is_italic, is_code = item
                    self._style_run(para.add_run(text), is_bold, is_italic, is_code)

                # The first row is always rendered bold (table header).
                if i == 0:
                    for paragraph in cell.paragraphs:
                        for run in paragraph.runs:
                            run.bold = True

        self.table_rows = []

    def handle_data(self, data):
        """Route a text chunk to the open table cell or the current paragraph."""
        if self._after_break:
            self._after_break = False
            # The break itself was already emitted for the preceding <br>.
            if data.startswith('\n'):
                data = data[1:]

        if self.in_table:
            # Keep only text that belongs to a cell of the outermost table.
            if data and self.current_cell is not None and self.table_depth == 1:
                self.current_cell_content.append(
                    (data, self.in_bold, self.in_italic, self.in_code))
            return

        if not data.strip():
            # Whitespace between block elements is layout only; a plain space
            # inside an open paragraph separates inline elements and is kept.
            if self.current_paragraph is None or '\n' in data or not data:
                return

        if self.current_paragraph is None:
            self.current_paragraph = self.doc.add_paragraph()

        run = self.current_paragraph.add_run(data)
        self._style_run(run, self.in_bold, self.in_italic, self.in_code)
        self._li_paragraph_empty = False
 def convert_markdown_to_word(markdown_text):
    """Convert Markdown text to a Word document saved under ``word_output``.

    The Markdown is rendered to HTML with the ``markdown`` package and the
    HTML is replayed into a new document by ``MarkdownToDocxParser``.

    Args:
        markdown_text: Markdown source; ``None``, empty or whitespace-only
            input is rejected without touching the file system.

    Returns:
        A ``(message, path)`` tuple. ``message`` is the status text shown to
        the user; ``path`` is the generated ``.docx`` path, or ``None`` when
        the input was empty or the conversion failed.
    """
    if not markdown_text or not markdown_text.strip():
        return "请输入 Markdown 内容！", None

    output_dir = "word_output"
    try:
        # Creating the directory inside the try block turns a file-system
        # failure into a status message instead of an unhandled exception.
        os.makedirs(output_dir, exist_ok=True)

        # The timestamp has one-second resolution, so add a counter rather
        # than overwrite a document generated within the same second.
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        stem = f"document_{timestamp}"
        docx_filename = f"{stem}.docx"
        docx_path = os.path.join(output_dir, docx_filename)
        attempt = 1
        while os.path.exists(docx_path):
            docx_filename = f"{stem}_{attempt}.docx"
            docx_path = os.path.join(output_dir, docx_filename)
            attempt += 1

        # Render Markdown to HTML (table support enabled).
        html_content = markdown.markdown(
            markdown_text,
            extensions=['extra', 'nl2br', 'tables']
        )

        doc = Document()
        parser = MarkdownToDocxParser(doc)
        parser.feed(html_content)
        # Flush text the parser may still be buffering at end of input.
        parser.close()

        doc.save(docx_path)
    except Exception as e:
        # UI boundary: report the failure to the user instead of raising.
        return f"转换失败: {e}", None

    # Only the file produced by this call is returned.
    return f"转换成功！文件已保存为: {docx_filename}", docx_path
 def get_word_files():
    """Return the ``.docx`` paths found in ``word_output``, newest first.

    Returns:
        Paths of the form ``word_output/<name>.docx`` ordered by modification
        time, most recent first; an empty list when the directory is missing.
    """
    folder = "word_output"
    if not os.path.exists(folder):
        return []

    docx_paths = [
        os.path.join(folder, entry)
        for entry in os.listdir(folder)
        if entry.endswith('.docx')
    ]
    # Most recently modified document comes first.
    return sorted(docx_paths, key=os.path.getmtime, reverse=True)
 def remove_headers(curl_command):
    # 使用正则表达式去除 if-none-match 和 range 标头
    curl_command = re.sub(r'(-H\s*\'if-none-match:[^\\]*\'\s*)', '', curl_command)
@ -301,13 +545,23 @@ with gr.Blocks() as iface:
        with gr.Tab("Markdown 转 Word"):
            gr.Markdown("## Markdown 转 Word 转换器")
            with gr.Row():
-                text_input = gr.Textbox(lines=10, placeholder="请在此输入 Markdown 内容...")
+                markdown_input = gr.Textbox(
                    lines=15, 
                    placeholder="请在此输入 Markdown 内容...",
                    label="Markdown 内容"
                )
            with gr.Row():
-                convert_button = gr.Button("转换")
+                convert_md_button = gr.Button("转换为 Word", variant="primary")
            with gr.Row():
-                output_file = gr.File(label="下载转换后的文件")
+                conversion_status = gr.Textbox(label="转换状态", interactive=False)
            with gr.Row():
                word_file_output = gr.File(label="生成的 Word 文档")
-            convert_button.click(convert_to_docx, inputs=text_input, outputs=output_file)
+            convert_md_button.click(
                convert_markdown_to_word, 
                inputs=markdown_input, 
                outputs=[conversion_status, word_file_output]
            )
        with gr.Tab("下载pdf"):
            gr.Markdown("## pdf 下载指令修复")
@ -353,4 +607,4 @@ with gr.Blocks() as iface:
    timer = gr.Timer(1.0)
    timer.tick(fn=update_log_output, outputs=[log_output])
-iface.launch(server_name="0.0.0.0")
+iface.launch(server_name="0.0.0.0", server_port=7861)
--- a/requirements.txt
+++ b/requirements.txt
@ -2,3 +2,5 @@ gradio==5.49.0
 openai==1.44.1
 python-dotenv~=1.0.1
 pillow~=10.4.0
 python-docx~=1.1.0
 markdown~=3.5