加入markdown转word功能

2025-10-11 10:36:24 +08:00 · 2025-10-11 10:36:24 +08:00 · 7ef1d542b3
commit 7ef1d542b3
parent 0bcfb262dc
4 changed files with 289 additions and 9 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
-.env
+.env
+/word_output
--- a/README.md
+++ b/README.md
@@ -1 +1,24 @@
-# chyoso-toolkit
+# chyoso-toolkit
+
+大模型工具集
+
+## 安装依赖
+
+```bash
+pip install -r requirements.txt
+```
+
+## 功能列表
+
+- 音频转写
+- 音频批量转写
+- Markdown 转 Word（纯 Python 实现，无需安装 pandoc）
+- PDF 下载
+- 图片 OCR 识别
+- 批量图片识别
+
+## 运行
+
+```bash
+python chyoso_toolkit_ui.py
+```
--- a/chyoso_toolkit_ui.py
+++ b/chyoso_toolkit_ui.py
@@ -5,9 +5,15 @@ import os
 import re
 import io
 import base64
+import datetime
 from dotenv import load_dotenv
 from openai import OpenAI
 from PIL import Image
+from docx import Document
+from docx.shared import Pt, Inches
+from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
+import markdown
+from html.parser import HTMLParser

 load_dotenv()

@@ -143,6 +149,244 @@ def convert_to_docx(text):
    return "file.docx"


+class MarkdownToDocxParser(HTMLParser):
+    """解析 HTML 并转换为 Word 文档"""
+    def __init__(self, document):
+        super().__init__()
+        self.doc = document
+        self.current_paragraph = None
+        self.current_run = None
+        self.in_bold = False
+        self.in_italic = False
+        self.in_code = False
+        self.in_heading = False
+        self.heading_level = 0
+        self.list_items = []
+        # 表格相关
+        self.in_table = False
+        self.table_depth = 0  # 追踪表格嵌套深度
+        self.current_table = None
+        self.current_row = None
+        self.current_cell = None
+        self.table_rows = []
+        self.current_row_cells = []
+        self.current_cell_content = []  # 存储单元格内容（包括格式）
+        self.is_header_row = False
+        
+    def handle_starttag(self, tag, attrs):
+        if tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
+            self.in_heading = True
+            self.heading_level = int(tag[1])
+            self.current_paragraph = self.doc.add_heading(level=self.heading_level)
+            self.current_paragraph.text = ''
+        elif tag == 'p':
+            if not self.in_table:  # 不在表格中才创建段落
+                self.current_paragraph = self.doc.add_paragraph()
+            elif self.current_cell is not None:
+                # 在表格单元格中，记录换行
+                self.current_cell_content.append({'type': 'break'})
+        elif tag == 'strong' or tag == 'b':
+            self.in_bold = True
+        elif tag == 'em' or tag == 'i':
+            self.in_italic = True
+        elif tag == 'code':
+            self.in_code = True
+        elif tag == 'li':
+            if not self.in_table:
+                self.current_paragraph = self.doc.add_paragraph(style='List Bullet')
+        elif tag == 'br':
+            if self.in_table and self.current_cell is not None:
+                self.current_cell_content.append({'type': 'break'})
+            elif self.current_paragraph:
+                self.current_paragraph.add_run().add_break()
+        # 表格处理
+        elif tag == 'table':
+            self.table_depth += 1
+            if self.table_depth == 1:  # 只处理最外层表格
+                self.in_table = True
+                self.table_rows = []
+        elif tag == 'thead':
+            if self.table_depth == 1:
+                self.is_header_row = True
+        elif tag == 'tbody':
+            if self.table_depth == 1:
+                self.is_header_row = False
+        elif tag == 'tr':
+            if self.table_depth == 1:
+                self.current_row_cells = []
+        elif tag == 'th' or tag == 'td':
+            if self.table_depth == 1:
+                self.current_cell = []
+                self.current_cell_content = []
+                if tag == 'th':
+                    self.in_bold = True  # 表头加粗
+                
+    def handle_endtag(self, tag):
+        if tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
+            self.in_heading = False
+            self.heading_level = 0
+        elif tag == 'p':
+            if not self.in_table:
+                self.current_paragraph = None
+        elif tag == 'strong' or tag == 'b':
+            self.in_bold = False
+        elif tag == 'em' or tag == 'i':
+            self.in_italic = False
+        elif tag == 'code':
+            self.in_code = False
+        elif tag == 'li':
+            if not self.in_table:
+                self.current_paragraph = None
+        # 表格处理
+        elif tag == 'table':
+            self.table_depth -= 1
+            if self.table_depth == 0:
+                self.in_table = False
+                self._create_table()
+        elif tag == 'thead':
+            if self.table_depth == 1:
+                self.is_header_row = False
+        elif tag == 'tr':
+            if self.table_depth == 1 and self.current_row_cells:
+                self.table_rows.append(self.current_row_cells)
+                self.current_row_cells = []
+        elif tag == 'th' or tag == 'td':
+            if self.table_depth == 1 and self.current_cell is not None:
+                # 保存单元格内容（包含格式信息）
+                self.current_row_cells.append(self.current_cell_content.copy())
+                self.current_cell = None
+                self.current_cell_content = []
+            if tag == 'th':
+                self.in_bold = False
+    
+    def _create_table(self):
+        """创建 Word 表格"""
+        if not self.table_rows:
+            return
+        
+        # 计算列数
+        max_cols = max(len(row) for row in self.table_rows) if self.table_rows else 0
+        if max_cols == 0:
+            return
+        
+        # 创建表格
+        table = self.doc.add_table(rows=len(self.table_rows), cols=max_cols)
+        table.style = 'Light Grid Accent 1'
+        
+        # 填充数据
+        for i, row_data in enumerate(self.table_rows):
+            row = table.rows[i]
+            for j, cell_content_list in enumerate(row_data):
+                if j >= len(row.cells):
+                    continue
+                cell = row.cells[j]
+                # 清空默认段落
+                cell.text = ''
+                para = cell.paragraphs[0] if cell.paragraphs else cell.add_paragraph()
+                
+                # 处理单元格内容（支持格式和换行）
+                for content_item in cell_content_list:
+                    if isinstance(content_item, dict):
+                        if content_item.get('type') == 'break':
+                            para.add_run().add_break()
+                    else:
+                        # 文本内容
+                        text, is_bold, is_italic = content_item
+                        run = para.add_run(text)
+                        if is_bold:
+                            run.bold = True
+                        if is_italic:
+                            run.italic = True
+                
+                # 第一行加粗（表头）
+                if i == 0:
+                    for paragraph in cell.paragraphs:
+                        for run in paragraph.runs:
+                            run.bold = True
+        
+        self.table_rows = []
+            
+    def handle_data(self, data):
+        if not data.strip() and not self.in_table:
+            return
+        
+        # 如果在表格单元格中（只处理最外层表格）
+        if self.current_cell is not None and self.table_depth == 1:
+            # 保存文本及其格式
+            self.current_cell_content.append((data, self.in_bold, self.in_italic))
+            return
+            
+        if not self.current_paragraph and not self.in_table:
+            self.current_paragraph = self.doc.add_paragraph()
+        
+        if self.current_paragraph:
+            run = self.current_paragraph.add_run(data)
+            
+            if self.in_bold:
+                run.bold = True
+            if self.in_italic:
+                run.italic = True
+            if self.in_code:
+                run.font.name = 'Courier New'
+                run.font.size = Pt(10)
+
+
+def convert_markdown_to_word(markdown_text):
+    """将 Markdown 文本转换为 Word 文档并保存到 word_output 目录（使用纯 Python 实现）"""
+    if not markdown_text or markdown_text.strip() == "":
+        return "请输入 Markdown 内容！", None
+    
+    # 确保输出目录存在
+    output_dir = "word_output"
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+    
+    # 生成文件名（使用时间戳）
+    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+    docx_filename = f"document_{timestamp}.docx"
+    docx_path = os.path.join(output_dir, docx_filename)
+    
+    try:
+        # 将 Markdown 转换为 HTML（启用表格扩展）
+        html_content = markdown.markdown(
+            markdown_text, 
+            extensions=['extra', 'nl2br', 'tables']
+        )
+        
+        # 创建 Word 文档
+        doc = Document()
+        
+        # 解析 HTML 并添加到 Word 文档
+        parser = MarkdownToDocxParser(doc)
+        parser.feed(html_content)
+        
+        # 保存文档
+        doc.save(docx_path)
+        message = f"转换成功！文件已保存为: {docx_filename}"
+        # 只返回本次生成的文件
+        return message, docx_path
+    except Exception as e:
+        message = f"转换失败: {e}"
+        return message, None
+
+
+def get_word_files():
+    """获取 word_output 目录下的所有 Word 文档"""
+    output_dir = "word_output"
+    if not os.path.exists(output_dir):
+        return []
+    
+    files = []
+    for filename in os.listdir(output_dir):
+        if filename.endswith('.docx'):
+            file_path = os.path.join(output_dir, filename)
+            files.append(file_path)
+    
+    # 按修改时间倒序排列
+    files.sort(key=lambda x: os.path.getmtime(x), reverse=True)
+    return files
+
+
 def remove_headers(curl_command):
    # 使用正则表达式去除 if-none-match 和 range 标头
    curl_command = re.sub(r'(-H\s*\'if-none-match:[^\\]*\'\s*)', '', curl_command)
@@ -301,13 +545,23 @@ with gr.Blocks() as iface:
        with gr.Tab("Markdown 转 Word"):
            gr.Markdown("## Markdown 转 Word 转换器")
            with gr.Row():
-                text_input = gr.Textbox(lines=10, placeholder="请在此输入 Markdown 内容...")
+                markdown_input = gr.Textbox(
+                    lines=15, 
+                    placeholder="请在此输入 Markdown 内容...",
+                    label="Markdown 内容"
+                )
            with gr.Row():
-                convert_button = gr.Button("转换")
+                convert_md_button = gr.Button("转换为 Word", variant="primary")
            with gr.Row():
-                output_file = gr.File(label="下载转换后的文件")
-
-            convert_button.click(convert_to_docx, inputs=text_input, outputs=output_file)
+                conversion_status = gr.Textbox(label="转换状态", interactive=False)
+            with gr.Row():
+                word_file_output = gr.File(label="生成的 Word 文档")
+            
+            convert_md_button.click(
+                convert_markdown_to_word, 
+                inputs=markdown_input, 
+                outputs=[conversion_status, word_file_output]
+            )

        with gr.Tab("下载pdf"):
            gr.Markdown("## pdf 下载指令修复")
@@ -353,4 +607,4 @@ with gr.Blocks() as iface:
    timer = gr.Timer(1.0)
    timer.tick(fn=update_log_output, outputs=[log_output])

-iface.launch(server_name="0.0.0.0")
+iface.launch(server_name="0.0.0.0", server_port=7861)
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,6 @@
 gradio==5.49.0
 openai==1.44.1
 python-dotenv~=1.0.1
-pillow~=10.4.0
+pillow~=10.4.0
+python-docx~=1.1.0
+markdown~=3.5