加入markdown转word功能

This commit is contained in:
William Jin 2025-10-11 10:36:24 +08:00
parent 0bcfb262dc
commit 7ef1d542b3
4 changed files with 289 additions and 9 deletions

3
.gitignore vendored
View File

@ -1 +1,2 @@
.env
.env
/word_output

View File

@ -1 +1,24 @@
# chyoso-toolkit
# chyoso-toolkit
大模型工具集
## 安装依赖
```bash
pip install -r requirements.txt
```
## 功能列表
- 音频转写
- 音频批量转写
- Markdown 转 Word(纯 Python 实现,无需安装 pandoc)
- PDF 下载
- 图片 OCR 识别
- 批量图片识别
## 运行
```bash
python chyoso_toolkit_ui.py
```

View File

@ -5,9 +5,15 @@ import os
import re
import io
import base64
import datetime
from dotenv import load_dotenv
from openai import OpenAI
from PIL import Image
from docx import Document
from docx.shared import Pt, Inches
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
import markdown
from html.parser import HTMLParser
load_dotenv()
@ -143,6 +149,244 @@ def convert_to_docx(text):
return "file.docx"
class MarkdownToDocxParser(HTMLParser):
    """Stream HTML into a Word document, one tag at a time.

    The parser is fed HTML and appends headings, paragraphs, list items and
    tables to ``document`` through its ``add_heading``, ``add_paragraph`` and
    ``add_table`` methods.

    Handled tags: ``h1``-``h6``, ``p``, ``strong``/``b``, ``em``/``i``,
    ``code``, ``ul``/``ol``/``li``, ``br`` and
    ``table``/``thead``/``tbody``/``tr``/``th``/``td``. Only the outermost
    table is rendered; text that sits inside a table but outside one of its
    own cells (including the content of nested tables) is discarded.

    Args:
        document: The target document object that receives the content.
    """

    _HEADING_TAGS = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')

    def __init__(self, document):
        super().__init__()
        self.doc = document
        self.current_paragraph = None
        self.current_run = None  # kept for compatibility; never assigned here
        self.in_bold = False
        self.in_italic = False
        self.in_code = False
        self.in_heading = False
        self.heading_level = 0
        self.list_items = []  # kept for compatibility; not used by the parser
        # Stack of currently open list tags ('ul' / 'ol'); the innermost one
        # decides whether an <li> is bulleted or numbered.
        self._list_stack = []
        # True while the paragraph created for an <li> has received no text,
        # so a directly nested <p> can reuse it instead of leaving an empty
        # bullet behind.
        self._li_paragraph_unused = False
        # True right after a <br> was written to a paragraph; the newline that
        # follows "<br />" in the HTML must not become a second line break.
        self._after_break = False
        # Table state
        self.in_table = False
        self.table_depth = 0  # nesting depth of <table> elements
        self.current_table = None
        self.current_row = None
        self.current_cell = None  # non-None while inside a <th>/<td>
        self.table_rows = []
        self.current_row_cells = []
        # Items collected for the open cell: (text, bold, italic) tuples and
        # {'type': 'break'} markers.
        self.current_cell_content = []
        self.is_header_row = False

    def handle_starttag(self, tag, attrs):
        """Open the document element or formatting state matching ``tag``."""
        if tag != 'br':
            self._after_break = False

        if tag in self._HEADING_TAGS:
            self.in_heading = True
            self.heading_level = int(tag[1])
            self.current_paragraph = self.doc.add_heading(level=self.heading_level)
            self._li_paragraph_unused = False
        elif tag == 'p':
            if not self.in_table:
                if self._li_paragraph_unused and self.current_paragraph is not None:
                    # <li><p>...</p></li>: write into the list paragraph.
                    self._li_paragraph_unused = False
                else:
                    self.current_paragraph = self.doc.add_paragraph()
            elif self.current_cell is not None:
                # Paragraphs inside a cell are recorded as line breaks.
                self.current_cell_content.append({'type': 'break'})
        elif tag in ('strong', 'b'):
            self.in_bold = True
        elif tag in ('em', 'i'):
            self.in_italic = True
        elif tag == 'code':
            self.in_code = True
        elif tag in ('ul', 'ol'):
            self._list_stack.append(tag)
        elif tag == 'li':
            if not self.in_table:
                ordered = bool(self._list_stack) and self._list_stack[-1] == 'ol'
                style = 'List Number' if ordered else 'List Bullet'
                self.current_paragraph = self.doc.add_paragraph(style=style)
                self._li_paragraph_unused = True
        elif tag == 'br':
            if self.in_table:
                if self.current_cell is not None:
                    self.current_cell_content.append({'type': 'break'})
            elif self.current_paragraph is not None:
                self.current_paragraph.add_run().add_break()
                self._after_break = True
        # Table handling: only the outermost table is collected.
        elif tag == 'table':
            self.table_depth += 1
            if self.table_depth == 1:
                self.in_table = True
                self.table_rows = []
        elif tag == 'thead':
            if self.table_depth == 1:
                self.is_header_row = True
        elif tag == 'tbody':
            if self.table_depth == 1:
                self.is_header_row = False
        elif tag == 'tr':
            if self.table_depth == 1:
                self.current_row_cells = []
        elif tag in ('th', 'td'):
            if self.table_depth == 1:
                self.current_cell = []
                self.current_cell_content = []
                if tag == 'th':
                    self.in_bold = True  # header cells are bold

    def handle_endtag(self, tag):
        """Close the element or formatting state matching ``tag``."""
        if tag in self._HEADING_TAGS:
            self.in_heading = False
            self.heading_level = 0
            # Release the heading so later text cannot be appended to it.
            self.current_paragraph = None
        elif tag == 'p':
            if not self.in_table:
                self.current_paragraph = None
        elif tag in ('strong', 'b'):
            self.in_bold = False
        elif tag in ('em', 'i'):
            self.in_italic = False
        elif tag == 'code':
            self.in_code = False
        elif tag in ('ul', 'ol'):
            if self._list_stack:
                self._list_stack.pop()
        elif tag == 'li':
            if not self.in_table:
                self.current_paragraph = None
                self._li_paragraph_unused = False
        # Table handling
        elif tag == 'table':
            # Ignore an unmatched </table>; a negative depth would hide every
            # table that follows.
            if self.table_depth > 0:
                self.table_depth -= 1
                if self.table_depth == 0:
                    self.in_table = False
                    self._create_table()
        elif tag == 'thead':
            if self.table_depth == 1:
                self.is_header_row = False
        elif tag == 'tr':
            if self.table_depth == 1 and self.current_row_cells:
                self.table_rows.append(self.current_row_cells)
                self.current_row_cells = []
        elif tag in ('th', 'td'):
            if self.table_depth == 1 and self.current_cell is not None:
                # Store the cell content together with its formatting.
                self.current_row_cells.append(self.current_cell_content.copy())
                self.current_cell = None
                self.current_cell_content = []
                if tag == 'th':
                    self.in_bold = False

    def _create_table(self):
        """Write the collected ``table_rows`` to the document as one table."""
        if not self.table_rows:
            return

        max_cols = max(len(row) for row in self.table_rows)
        if max_cols == 0:
            return

        table = self.doc.add_table(rows=len(self.table_rows), cols=max_cols)
        table.style = 'Light Grid Accent 1'

        for i, row_data in enumerate(self.table_rows):
            row = table.rows[i]
            for j, cell_content_list in enumerate(row_data):
                if j >= len(row.cells):
                    continue
                cell = row.cells[j]
                # Clear the default paragraph before writing runs into it.
                cell.text = ''
                para = cell.paragraphs[0] if cell.paragraphs else cell.add_paragraph()

                for content_item in cell_content_list:
                    if isinstance(content_item, dict):
                        if content_item.get('type') == 'break':
                            para.add_run().add_break()
                    else:
                        text, is_bold, is_italic = content_item
                        run = para.add_run(text)
                        if is_bold:
                            run.bold = True
                        if is_italic:
                            run.italic = True

                # The first row is treated as the header and made bold.
                if i == 0:
                    for paragraph in cell.paragraphs:
                        for run in paragraph.runs:
                            run.bold = True

        self.table_rows = []

    def handle_data(self, data):
        """Add text to the open table cell or to the current paragraph."""
        if self.in_table:
            # Inside a table only text belonging to a cell of the outermost
            # table is kept; whitespace between table tags is dropped.
            if self.current_cell is not None and self.table_depth == 1:
                self.current_cell_content.append((data, self.in_bold, self.in_italic))
            return

        if self._after_break:
            self._after_break = False
            # "<br />" is followed by a newline in the HTML; the break was
            # already written, so drop that one newline.
            if data.startswith('\n'):
                data = data[1:]

        if not data.strip():
            # Whitespace between block elements is layout only. A plain space
            # between inline elements of an open paragraph is real content.
            if not data or self.current_paragraph is None or '\n' in data:
                return

        if self.current_paragraph is None:
            self.current_paragraph = self.doc.add_paragraph()

        run = self.current_paragraph.add_run(data)
        self._li_paragraph_unused = False
        if self.in_bold:
            run.bold = True
        if self.in_italic:
            run.italic = True
        if self.in_code:
            run.font.name = 'Courier New'
            run.font.size = Pt(10)
def convert_markdown_to_word(markdown_text):
    """Convert Markdown text to a Word file saved under ``word_output``.

    The text is rendered to HTML with the ``markdown`` package and the HTML is
    replayed into a new document through ``MarkdownToDocxParser``. The file is
    named ``document_<YYYYmmdd_HHMMSS>.docx``.

    Args:
        markdown_text: Markdown source; ``None``, empty or whitespace-only
            input is rejected.

    Returns:
        A ``(message, path)`` tuple. ``path`` is the saved file's path on
        success and ``None`` when the input is empty or the conversion fails.
    """
    if not markdown_text or not markdown_text.strip():
        return "请输入 Markdown 内容!", None

    output_dir = "word_output"
    # File name is derived from the current timestamp (second resolution).
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    docx_filename = f"document_{timestamp}.docx"
    docx_path = os.path.join(output_dir, docx_filename)

    try:
        # Created inside the try block so a filesystem error is reported
        # through the status message like any other failure.
        os.makedirs(output_dir, exist_ok=True)

        # Render Markdown to HTML with the table extension enabled.
        html_content = markdown.markdown(
            markdown_text,
            extensions=['extra', 'nl2br', 'tables']
        )

        doc = Document()
        parser = MarkdownToDocxParser(doc)
        parser.feed(html_content)
        # Flush text the HTML parser may still be buffering.
        parser.close()

        doc.save(docx_path)
    except Exception as e:
        # UI boundary: report the failure instead of raising.
        return f"转换失败: {e}", None

    # Only the file generated by this call is returned.
    return f"转换成功!文件已保存为: {docx_filename}", docx_path
def get_word_files():
    """Return paths of the ``.docx`` files in ``word_output``, newest first.

    Returns:
        A list of ``word_output/<name>`` paths ordered by modification time
        in descending order, or an empty list when the directory is missing.
    """
    folder = "word_output"
    if os.path.exists(folder):
        documents = [
            os.path.join(folder, entry)
            for entry in os.listdir(folder)
            if entry.endswith('.docx')
        ]
        # Most recently modified file comes first.
        return sorted(documents, key=os.path.getmtime, reverse=True)
    return []
def remove_headers(curl_command):
# 使用正则表达式去除 if-none-match 和 range 标头
curl_command = re.sub(r'(-H\s*\'if-none-match:[^\\]*\'\s*)', '', curl_command)
@ -301,13 +545,23 @@ with gr.Blocks() as iface:
with gr.Tab("Markdown 转 Word"):
gr.Markdown("## Markdown 转 Word 转换器")
with gr.Row():
text_input = gr.Textbox(lines=10, placeholder="请在此输入 Markdown 内容...")
markdown_input = gr.Textbox(
lines=15,
placeholder="请在此输入 Markdown 内容...",
label="Markdown 内容"
)
with gr.Row():
convert_button = gr.Button("转换")
convert_md_button = gr.Button("转换为 Word", variant="primary")
with gr.Row():
output_file = gr.File(label="下载转换后的文件")
convert_button.click(convert_to_docx, inputs=text_input, outputs=output_file)
conversion_status = gr.Textbox(label="转换状态", interactive=False)
with gr.Row():
word_file_output = gr.File(label="生成的 Word 文档")
convert_md_button.click(
convert_markdown_to_word,
inputs=markdown_input,
outputs=[conversion_status, word_file_output]
)
with gr.Tab("下载pdf"):
gr.Markdown("## pdf 下载指令修复")
@ -353,4 +607,4 @@ with gr.Blocks() as iface:
timer = gr.Timer(1.0)
timer.tick(fn=update_log_output, outputs=[log_output])
iface.launch(server_name="0.0.0.0")
iface.launch(server_name="0.0.0.0", server_port=7861)

View File

@ -1,4 +1,6 @@
gradio==5.49.0
openai==1.44.1
python-dotenv~=1.0.1
pillow~=10.4.0
pillow~=10.4.0
python-docx~=1.1.0
markdown~=3.5