加入markdown转word功能
This commit is contained in:
parent
0bcfb262dc
commit
7ef1d542b3
1
.gitignore
vendored
1
.gitignore
vendored
@ -1 +1,2 @@
|
||||
.env
|
||||
/word_output
|
||||
23
README.md
23
README.md
@ -1 +1,24 @@
|
||||
# chyoso-toolkit
|
||||
|
||||
大模型工具集
|
||||
|
||||
## 安装依赖
|
||||
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
## 功能列表
|
||||
|
||||
- 音频转写
|
||||
- 音频批量转写
|
||||
- Markdown 转 Word(纯 Python 实现,无需安装 pandoc)
|
||||
- PDF 下载
|
||||
- 图片 OCR 识别
|
||||
- 批量图片识别
|
||||
|
||||
## 运行
|
||||
|
||||
```bash
|
||||
python chyoso_toolkit_ui.py
|
||||
```
|
||||
@ -5,9 +5,15 @@ import os
|
||||
import re
|
||||
import io
|
||||
import base64
|
||||
import datetime
|
||||
from dotenv import load_dotenv
|
||||
from openai import OpenAI
|
||||
from PIL import Image
|
||||
from docx import Document
|
||||
from docx.shared import Pt, Inches
|
||||
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
|
||||
import markdown
|
||||
from html.parser import HTMLParser
|
||||
|
||||
load_dotenv()
|
||||
|
||||
@ -143,6 +149,244 @@ def convert_to_docx(text):
|
||||
return "file.docx"
|
||||
|
||||
|
||||
class MarkdownToDocxParser(HTMLParser):
    """Feed HTML into this parser to append its content to a Word document.

    The parser understands the subset of HTML handled below: headings
    (h1-h6), paragraphs, bold/italic/inline code, ``<pre>`` blocks, bullet
    and numbered lists, line breaks and (outermost) tables.  Content is
    written to ``document`` as the tags are encountered; a table is buffered
    row by row and materialised when its closing ``</table>`` arrives.

    Args:
        document: The target document.  It must provide ``add_heading``,
            ``add_paragraph`` and ``add_table`` the way a python-docx
            ``Document`` does.
    """

    _HEADING_TAGS = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')
    # A newline in HTML text is just inter-word whitespace.  Collapsing it
    # matters because assigning "\n" to a python-docx run emits a line break.
    _NEWLINE_RUN = re.compile(r'\s*\n\s*')

    def __init__(self, document):
        super().__init__()
        self.doc = document
        self.current_paragraph = None
        self.current_run = None
        self.in_bold = False
        self.in_italic = False
        self.in_code = False
        self.in_pre = False
        self.in_heading = False
        self.heading_level = 0
        self.list_items = []
        # Stack of open list tags ('ul' / 'ol'); the top decides the li style.
        self._list_stack = []
        # True right after a paragraph is opened or a <br> is emitted, so
        # that leading whitespace of the next text node is discarded.
        self._at_line_start = False
        # True between <li> and its first content, so a wrapping <p>
        # (loose list) reuses the list paragraph instead of opening a new one.
        self._fresh_list_item = False
        # Table state
        self.in_table = False
        self.table_depth = 0  # nesting depth; only depth 1 is rendered
        self.current_table = None
        self.current_row = None
        self.current_cell = None
        self.table_rows = []
        self.current_row_cells = []
        # Items are (text, bold, italic) tuples or {'type': 'break'} dicts.
        self.current_cell_content = []
        self.is_header_row = False

    def _start_paragraph(self, style=None):
        """Open a new body paragraph and make it the current one."""
        self.current_paragraph = self.doc.add_paragraph(style=style)
        self._at_line_start = True

    def handle_starttag(self, tag, attrs):
        """Update formatting state / open document elements for ``tag``."""
        if tag in self._HEADING_TAGS:
            self.in_heading = True
            self.heading_level = int(tag[1])
            self.current_paragraph = self.doc.add_heading(level=self.heading_level)
            self._at_line_start = True
        elif tag == 'p':
            if self.in_table:
                if self.current_cell is not None:
                    # Paragraphs inside a cell are flattened to line breaks.
                    self.current_cell_content.append({'type': 'break'})
            elif self._fresh_list_item and self.current_paragraph is not None:
                # <li><p>...</p></li>: keep writing into the list paragraph.
                self._fresh_list_item = False
            else:
                self._start_paragraph()
        elif tag == 'strong' or tag == 'b':
            self.in_bold = True
        elif tag == 'em' or tag == 'i':
            self.in_italic = True
        elif tag == 'code':
            self.in_code = True
        elif tag == 'pre':
            self.in_pre = True
        elif tag == 'ul' or tag == 'ol':
            self._list_stack.append(tag)
        elif tag == 'li':
            if not self.in_table:
                ordered = bool(self._list_stack) and self._list_stack[-1] == 'ol'
                # NOTE(review): with python-docx's default template every
                # 'List Number' paragraph presumably shares one numbering
                # sequence, so separate lists may continue counting - confirm.
                self._start_paragraph(style='List Number' if ordered else 'List Bullet')
                self._fresh_list_item = True
        elif tag == 'br':
            if self.in_table:
                if self.current_cell is not None:
                    self.current_cell_content.append({'type': 'break'})
            elif self.current_paragraph is not None:
                self.current_paragraph.add_run().add_break()
                self._at_line_start = True
        # Table handling
        elif tag == 'table':
            self.table_depth += 1
            if self.table_depth == 1:  # only the outermost table is rendered
                self.in_table = True
                self.table_rows = []
        elif tag == 'thead':
            if self.table_depth == 1:
                self.is_header_row = True
        elif tag == 'tbody':
            if self.table_depth == 1:
                self.is_header_row = False
        elif tag == 'tr':
            if self.table_depth == 1:
                self.current_row_cells = []
        elif tag == 'th' or tag == 'td':
            if self.table_depth == 1:
                self.current_cell = []
                self.current_cell_content = []
                if tag == 'th':
                    self.in_bold = True  # header cells are bold

    def handle_endtag(self, tag):
        """Close the element / formatting state opened for ``tag``."""
        if tag in self._HEADING_TAGS:
            self.in_heading = False
            self.heading_level = 0
            # Without this reset, text following the heading (for example the
            # whitespace between table tags) would be appended to it.
            self.current_paragraph = None
        elif tag == 'p':
            if not self.in_table:
                self.current_paragraph = None
        elif tag == 'strong' or tag == 'b':
            self.in_bold = False
        elif tag == 'em' or tag == 'i':
            self.in_italic = False
        elif tag == 'code':
            self.in_code = False
        elif tag == 'pre':
            self.in_pre = False
            self.current_paragraph = None
        elif tag == 'ul' or tag == 'ol':
            if self._list_stack:
                self._list_stack.pop()
        elif tag == 'li':
            if not self.in_table:
                self.current_paragraph = None
                self._fresh_list_item = False
        # Table handling
        elif tag == 'table':
            if self.table_depth > 0:  # ignore a stray </table>
                self.table_depth -= 1
                if self.table_depth == 0:
                    self.in_table = False
                    self._create_table()
                    # Text after the table must not land in a paragraph
                    # that precedes it in the document.
                    self.current_paragraph = None
        elif tag == 'thead':
            if self.table_depth == 1:
                self.is_header_row = False
        elif tag == 'tr':
            if self.table_depth == 1 and self.current_row_cells:
                self.table_rows.append(self.current_row_cells)
                self.current_row_cells = []
        elif tag == 'th' or tag == 'td':
            if self.table_depth == 1 and self.current_cell is not None:
                # Store the cell content together with its formatting.
                self.current_row_cells.append(self.current_cell_content.copy())
                self.current_cell = None
                self.current_cell_content = []
                if tag == 'th':
                    self.in_bold = False

    def _create_table(self):
        """Write the buffered ``table_rows`` to the document as a Word table."""
        if not self.table_rows:
            return

        max_cols = max(len(row) for row in self.table_rows)
        if max_cols == 0:
            return

        table = self.doc.add_table(rows=len(self.table_rows), cols=max_cols)
        table.style = 'Light Grid Accent 1'

        for i, row_data in enumerate(self.table_rows):
            cells = table.rows[i].cells
            for j, cell_content_list in enumerate(row_data):
                if j >= len(cells):
                    continue
                cell = cells[j]
                # Clear the default content, then write into the first paragraph.
                cell.text = ''
                para = cell.paragraphs[0] if cell.paragraphs else cell.add_paragraph()

                for content_item in cell_content_list:
                    if isinstance(content_item, dict):
                        if content_item.get('type') == 'break':
                            para.add_run().add_break()
                    else:
                        text, is_bold, is_italic = content_item
                        run = para.add_run(text)
                        if is_bold:
                            run.bold = True
                        if is_italic:
                            run.italic = True

                # The first row is treated as the header and forced to bold.
                if i == 0:
                    for paragraph in cell.paragraphs:
                        for run in paragraph.runs:
                            run.bold = True

        self.table_rows = []

    def handle_data(self, data):
        """Append a text node to the current cell or paragraph."""
        if self.in_table:
            # Only text inside a cell of the outermost table is kept; the
            # whitespace between <tr>/<td> tags and nested-table text is dropped.
            if self.current_cell is not None and self.table_depth == 1:
                self.current_cell_content.append((data, self.in_bold, self.in_italic))
            return

        if self.in_pre:
            text = data  # preformatted: keep whitespace and newlines verbatim
        else:
            text = self._NEWLINE_RUN.sub(' ', data)
            if self.current_paragraph is None or self._at_line_start:
                text = text.lstrip()
            if not text:
                return

        if self.current_paragraph is None:
            self._start_paragraph()

        run = self.current_paragraph.add_run(text)
        self._at_line_start = False
        self._fresh_list_item = False

        if self.in_bold:
            run.bold = True
        if self.in_italic:
            run.italic = True
        if self.in_code:
            run.font.name = 'Courier New'
            run.font.size = Pt(10)
|
||||
|
||||
|
||||
def convert_markdown_to_word(markdown_text):
    """Convert Markdown text to a .docx file saved under ``word_output``.

    The Markdown is rendered to HTML with the ``markdown`` package and the
    HTML is then written into a new Word document by
    ``MarkdownToDocxParser`` (pure Python, no pandoc involved).

    Args:
        markdown_text: The Markdown source to convert.

    Returns:
        A ``(message, path)`` tuple.  ``message`` is a user-facing status
        string; ``path`` is the path of the generated file, or ``None`` when
        the input was empty or the conversion failed.
    """
    if not markdown_text or not markdown_text.strip():
        return "请输入 Markdown 内容!", None

    output_dir = "word_output"

    # Microseconds are part of the name so that two conversions finishing
    # within the same second do not overwrite each other's file.
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S_%f")
    docx_filename = f"document_{timestamp}.docx"
    docx_path = os.path.join(output_dir, docx_filename)

    try:
        # Created inside the try block so that a failure is reported through
        # the returned message; exist_ok avoids a check-then-create race.
        os.makedirs(output_dir, exist_ok=True)

        # Render Markdown to HTML (table support enabled).
        html_content = markdown.markdown(
            markdown_text,
            extensions=['extra', 'nl2br', 'tables']
        )

        doc = Document()

        parser = MarkdownToDocxParser(doc)
        parser.feed(html_content)
        # HTMLParser buffers text that is not yet followed by a tag;
        # close() flushes it so trailing text is not lost.
        parser.close()

        doc.save(docx_path)
    except Exception as e:
        # Deliberately broad: this is the UI boundary and every failure is
        # surfaced to the user as a status message instead of a traceback.
        return f"转换失败: {e}", None

    # Only the file generated by this call is returned.
    return f"转换成功!文件已保存为: {docx_filename}", docx_path
|
||||
|
||||
|
||||
def get_word_files():
    """Return the paths of the .docx files in ``word_output``, newest first.

    Only regular files whose name ends in ``.docx`` are returned; they are
    ordered by modification time, most recent first.

    Returns:
        A list of paths (``word_output/<name>``).  The list is empty when
        the directory does not exist or is not a directory.
    """
    output_dir = "word_output"
    if not os.path.isdir(output_dir):
        return []

    dated_paths = []
    for filename in os.listdir(output_dir):
        if not filename.endswith('.docx'):
            continue
        file_path = os.path.join(output_dir, filename)
        if not os.path.isfile(file_path):
            continue  # e.g. a directory that happens to be named *.docx
        try:
            modified = os.path.getmtime(file_path)
        except OSError:
            continue  # the file vanished between listing and stat
        dated_paths.append((modified, file_path))

    # Sort on the timestamp only, so files with equal times keep listing order.
    dated_paths.sort(key=lambda item: item[0], reverse=True)
    return [file_path for _, file_path in dated_paths]
|
||||
|
||||
|
||||
def remove_headers(curl_command):
|
||||
# 使用正则表达式去除 if-none-match 和 range 标头
|
||||
curl_command = re.sub(r'(-H\s*\'if-none-match:[^\\]*\'\s*)', '', curl_command)
|
||||
@ -301,13 +545,23 @@ with gr.Blocks() as iface:
|
||||
with gr.Tab("Markdown 转 Word"):
|
||||
gr.Markdown("## Markdown 转 Word 转换器")
|
||||
with gr.Row():
|
||||
text_input = gr.Textbox(lines=10, placeholder="请在此输入 Markdown 内容...")
|
||||
markdown_input = gr.Textbox(
|
||||
lines=15,
|
||||
placeholder="请在此输入 Markdown 内容...",
|
||||
label="Markdown 内容"
|
||||
)
|
||||
with gr.Row():
|
||||
convert_button = gr.Button("转换")
|
||||
convert_md_button = gr.Button("转换为 Word", variant="primary")
|
||||
with gr.Row():
|
||||
output_file = gr.File(label="下载转换后的文件")
|
||||
conversion_status = gr.Textbox(label="转换状态", interactive=False)
|
||||
with gr.Row():
|
||||
word_file_output = gr.File(label="生成的 Word 文档")
|
||||
|
||||
convert_button.click(convert_to_docx, inputs=text_input, outputs=output_file)
|
||||
convert_md_button.click(
|
||||
convert_markdown_to_word,
|
||||
inputs=markdown_input,
|
||||
outputs=[conversion_status, word_file_output]
|
||||
)
|
||||
|
||||
with gr.Tab("下载pdf"):
|
||||
gr.Markdown("## pdf 下载指令修复")
|
||||
@ -353,4 +607,4 @@ with gr.Blocks() as iface:
|
||||
timer = gr.Timer(1.0)
|
||||
timer.tick(fn=update_log_output, outputs=[log_output])
|
||||
|
||||
iface.launch(server_name="0.0.0.0")
|
||||
iface.launch(server_name="0.0.0.0", server_port=7861)
|
||||
|
||||
@ -2,3 +2,5 @@ gradio==5.49.0
|
||||
openai==1.44.1
|
||||
python-dotenv~=1.0.1
|
||||
pillow~=10.4.0
|
||||
python-docx~=1.1.0
|
||||
markdown~=3.5
|
||||
Loading…
x
Reference in New Issue
Block a user