加入markdown转word功能
This commit is contained in:
parent
0bcfb262dc
commit
7ef1d542b3
1
.gitignore
vendored
1
.gitignore
vendored
@ -1 +1,2 @@
|
|||||||
.env
|
.env
|
||||||
|
/word_output
|
||||||
23
README.md
23
README.md
@ -1 +1,24 @@
|
|||||||
# chyoso-toolkit
|
# chyoso-toolkit
|
||||||
|
|
||||||
|
大模型工具集
|
||||||
|
|
||||||
|
## 安装依赖
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install -r requirements.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
## 功能列表
|
||||||
|
|
||||||
|
- 音频转写
|
||||||
|
- 音频批量转写
|
||||||
|
- Markdown 转 Word(纯 Python 实现,无需安装 pandoc)
|
||||||
|
- PDF 下载
|
||||||
|
- 图片 OCR 识别
|
||||||
|
- 批量图片识别
|
||||||
|
|
||||||
|
## 运行
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python chyoso_toolkit_ui.py
|
||||||
|
```
|
||||||
@ -5,9 +5,15 @@ import os
|
|||||||
import re
|
import re
|
||||||
import io
|
import io
|
||||||
import base64
|
import base64
|
||||||
|
import datetime
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
from openai import OpenAI
|
from openai import OpenAI
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
from docx import Document
|
||||||
|
from docx.shared import Pt, Inches
|
||||||
|
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
|
||||||
|
import markdown
|
||||||
|
from html.parser import HTMLParser
|
||||||
|
|
||||||
load_dotenv()
|
load_dotenv()
|
||||||
|
|
||||||
@ -143,6 +149,244 @@ def convert_to_docx(text):
|
|||||||
return "file.docx"
|
return "file.docx"
|
||||||
|
|
||||||
|
|
||||||
|
class MarkdownToDocxParser(HTMLParser):
    """Stream HTML into a python-docx document.

    The parser is fed the HTML produced from Markdown and appends headings,
    paragraphs, list items and tables to ``document`` as the tags arrive.
    Inline bold / italic / code state is tracked with flags and applied to
    each text run when it is written.

    Tables are buffered: cell contents are collected while the outermost
    ``<table>`` is open and the Word table is created when it closes.
    Nested tables are not rendered; their content is dropped.
    """

    _HEADING_TAGS = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')

    def __init__(self, document):
        """Bind the parser to ``document``, the object that receives content.

        ``document`` must provide ``add_heading``, ``add_paragraph`` and
        ``add_table`` (a python-docx ``Document`` in this file).
        """
        super().__init__()
        self.doc = document
        self.current_paragraph = None
        self.current_run = None
        self.in_bold = False
        self.in_italic = False
        self.in_code = False
        self.in_heading = False
        self.heading_level = 0
        self.list_items = []
        # Open list containers ('ul' / 'ol'), innermost last; decides
        # whether an <li> is bulleted or numbered.
        self.list_stack = []
        # True right after an <li> created its paragraph and before any text
        # was written, so a directly nested <p> reuses that paragraph
        # instead of leaving an empty bullet behind.
        self._reuse_list_paragraph = False
        # Table state
        self.in_table = False
        self.table_depth = 0  # nesting depth of open <table> tags
        self.current_table = None
        self.current_row = None
        self.current_cell = None
        self.table_rows = []
        self.current_row_cells = []
        # Items of the open cell: (text, bold, italic) tuples and
        # {'type': 'break'} markers.
        self.current_cell_content = []
        self.is_header_row = False

    def handle_starttag(self, tag, attrs):
        """Open the document element or set the inline flag for ``tag``."""
        if tag in self._HEADING_TAGS:
            self.in_heading = True
            self.heading_level = int(tag[1])
            self.current_paragraph = self.doc.add_heading(level=self.heading_level)
            self.current_paragraph.text = ''
        elif tag == 'p':
            if not self.in_table:
                if self._reuse_list_paragraph and self.current_paragraph is not None:
                    # <li><p>...</p></li>: write into the list paragraph.
                    self._reuse_list_paragraph = False
                else:
                    self.current_paragraph = self.doc.add_paragraph()
            elif self.current_cell is not None:
                # Inside a table cell a paragraph is recorded as a line break.
                self.current_cell_content.append({'type': 'break'})
        elif tag in ('strong', 'b'):
            self.in_bold = True
        elif tag in ('em', 'i'):
            self.in_italic = True
        elif tag == 'code':
            self.in_code = True
        elif tag in ('ul', 'ol'):
            self.list_stack.append(tag)
        elif tag == 'li':
            if not self.in_table:
                ordered = bool(self.list_stack) and self.list_stack[-1] == 'ol'
                style = 'List Number' if ordered else 'List Bullet'
                self.current_paragraph = self.doc.add_paragraph(style=style)
                self._reuse_list_paragraph = True
        elif tag == 'br':
            if self.in_table and self.current_cell is not None:
                self.current_cell_content.append({'type': 'break'})
            elif self.current_paragraph:
                self.current_paragraph.add_run().add_break()
        # Table handling: only the outermost table (depth 1) is processed.
        elif tag == 'table':
            self.table_depth += 1
            if self.table_depth == 1:
                self.in_table = True
                self.table_rows = []
        elif tag == 'thead':
            if self.table_depth == 1:
                self.is_header_row = True
        elif tag == 'tbody':
            if self.table_depth == 1:
                self.is_header_row = False
        elif tag == 'tr':
            if self.table_depth == 1:
                self.current_row_cells = []
        elif tag in ('th', 'td'):
            if self.table_depth == 1:
                self.current_cell = []
                self.current_cell_content = []
                if tag == 'th':
                    self.in_bold = True  # header cells are bold

    def handle_endtag(self, tag):
        """Close the element or clear the inline flag for ``tag``."""
        if tag in self._HEADING_TAGS:
            self.in_heading = False
            self.heading_level = 0
            # Drop the heading so later text (code blocks, stray data) is
            # not appended to it.
            self.current_paragraph = None
        elif tag == 'p':
            if not self.in_table:
                self.current_paragraph = None
        elif tag == 'pre':
            if not self.in_table:
                # Keep consecutive code blocks in separate paragraphs.
                self.current_paragraph = None
        elif tag in ('strong', 'b'):
            self.in_bold = False
        elif tag in ('em', 'i'):
            self.in_italic = False
        elif tag == 'code':
            self.in_code = False
        elif tag in ('ul', 'ol'):
            if self.list_stack:
                self.list_stack.pop()
        elif tag == 'li':
            if not self.in_table:
                self.current_paragraph = None
                self._reuse_list_paragraph = False
        # Table handling
        elif tag == 'table':
            self.table_depth -= 1
            if self.table_depth == 0:
                self.in_table = False
                self._create_table()
        elif tag == 'thead':
            if self.table_depth == 1:
                self.is_header_row = False
        elif tag == 'tr':
            if self.table_depth == 1 and self.current_row_cells:
                self.table_rows.append(self.current_row_cells)
                self.current_row_cells = []
        elif tag in ('th', 'td'):
            if self.table_depth == 1 and self.current_cell is not None:
                # Store the cell content together with its formatting.
                self.current_row_cells.append(self.current_cell_content.copy())
                self.current_cell = None
                self.current_cell_content = []
                if tag == 'th':
                    self.in_bold = False

    def _create_table(self):
        """Create a Word table from the buffered rows, then clear the buffer."""
        if not self.table_rows:
            return

        max_cols = max(len(row) for row in self.table_rows)
        if max_cols == 0:
            return

        table = self.doc.add_table(rows=len(self.table_rows), cols=max_cols)
        table.style = 'Light Grid Accent 1'

        for i, row_data in enumerate(self.table_rows):
            cells = table.rows[i].cells  # looked up once per row
            for j, cell_content_list in enumerate(row_data):
                if j >= len(cells):
                    continue
                cell = cells[j]
                # Reset the default paragraph before writing runs.
                cell.text = ''
                para = cell.paragraphs[0] if cell.paragraphs else cell.add_paragraph()

                for content_item in cell_content_list:
                    if isinstance(content_item, dict):
                        if content_item.get('type') == 'break':
                            para.add_run().add_break()
                    else:
                        text, is_bold, is_italic = content_item
                        run = para.add_run(text)
                        if is_bold:
                            run.bold = True
                        if is_italic:
                            run.italic = True

                # The first row is always rendered bold (treated as header).
                if i == 0:
                    for paragraph in cell.paragraphs:
                        for run in paragraph.runs:
                            run.bold = True

        self.table_rows = []

    def handle_data(self, data):
        """Write ``data`` to the open table cell or to the current paragraph."""
        if self.in_table:
            # Inside a table only text that belongs to a cell of the
            # outermost table is kept. Whitespace between table tags must not
            # leak into a previously opened paragraph.
            if self.current_cell is not None and self.table_depth == 1:
                self.current_cell_content.append((data, self.in_bold, self.in_italic))
            return

        if not data.strip():
            return

        if self.current_paragraph is None:
            self.current_paragraph = self.doc.add_paragraph()

        run = self.current_paragraph.add_run(data)
        self._reuse_list_paragraph = False

        if self.in_bold:
            run.bold = True
        if self.in_italic:
            run.italic = True
        if self.in_code:
            run.font.name = 'Courier New'
            run.font.size = Pt(10)
|
||||||
|
|
||||||
|
|
||||||
|
def convert_markdown_to_word(markdown_text):
    """Convert Markdown text to a .docx file inside the ``word_output`` directory.

    The Markdown is rendered to HTML with the ``markdown`` package and the
    HTML is replayed into a new Word document by ``MarkdownToDocxParser``.

    Args:
        markdown_text: Markdown source; empty or whitespace-only input is
            rejected.

    Returns:
        A ``(message, path)`` tuple. ``message`` is the status text shown to
        the user; ``path`` is the generated file path, or ``None`` when the
        input was empty or the conversion failed.
    """
    if not markdown_text or not markdown_text.strip():
        return "请输入 Markdown 内容!", None

    output_dir = "word_output"
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    docx_filename = f"document_{timestamp}.docx"

    try:
        # Inside the try block so a directory error is reported as a status
        # message; exist_ok avoids the check-then-create race.
        os.makedirs(output_dir, exist_ok=True)

        # Two conversions within the same second would share a name; add a
        # numeric suffix instead of overwriting the earlier file.
        docx_path = os.path.join(output_dir, docx_filename)
        suffix = 0
        while os.path.exists(docx_path):
            suffix += 1
            docx_filename = f"document_{timestamp}_{suffix}.docx"
            docx_path = os.path.join(output_dir, docx_filename)

        html_content = markdown.markdown(
            markdown_text,
            extensions=['extra', 'nl2br', 'tables'],
        )

        doc = Document()
        parser = MarkdownToDocxParser(doc)
        parser.feed(html_content)
        # Flush any text the HTML parser is still buffering.
        parser.close()

        doc.save(docx_path)
    except Exception as e:
        # UI boundary: report the failure instead of raising.
        return f"转换失败: {e}", None

    # Only the file produced by this call is returned.
    return f"转换成功!文件已保存为: {docx_filename}", docx_path
|
||||||
|
|
||||||
|
|
||||||
|
def get_word_files():
    """Return the paths of the .docx files in ``word_output``, newest first.

    Returns:
        A list of paths of the form ``word_output/<name>.docx`` sorted by
        modification time in descending order. The list is empty when the
        directory does not exist.
    """
    output_dir = "word_output"
    stamped = []
    try:
        with os.scandir(output_dir) as entries:
            for entry in entries:
                if not entry.name.endswith('.docx'):
                    continue
                try:
                    # Skip directories that merely carry a .docx name.
                    if entry.is_file():
                        stamped.append((entry.stat().st_mtime, entry.path))
                except OSError:
                    # The file vanished or became unreadable during the scan.
                    continue
    except (FileNotFoundError, NotADirectoryError):
        return []

    # Newest first; the sort is stable for files with equal timestamps.
    stamped.sort(key=lambda item: item[0], reverse=True)
    return [path for _, path in stamped]
|
||||||
|
|
||||||
|
|
||||||
def remove_headers(curl_command):
|
def remove_headers(curl_command):
|
||||||
# 使用正则表达式去除 if-none-match 和 range 标头
|
# 使用正则表达式去除 if-none-match 和 range 标头
|
||||||
curl_command = re.sub(r'(-H\s*\'if-none-match:[^\\]*\'\s*)', '', curl_command)
|
curl_command = re.sub(r'(-H\s*\'if-none-match:[^\\]*\'\s*)', '', curl_command)
|
||||||
@ -301,13 +545,23 @@ with gr.Blocks() as iface:
|
|||||||
with gr.Tab("Markdown 转 Word"):
|
with gr.Tab("Markdown 转 Word"):
|
||||||
gr.Markdown("## Markdown 转 Word 转换器")
|
gr.Markdown("## Markdown 转 Word 转换器")
|
||||||
with gr.Row():
|
with gr.Row():
|
||||||
text_input = gr.Textbox(lines=10, placeholder="请在此输入 Markdown 内容...")
|
markdown_input = gr.Textbox(
|
||||||
|
lines=15,
|
||||||
|
placeholder="请在此输入 Markdown 内容...",
|
||||||
|
label="Markdown 内容"
|
||||||
|
)
|
||||||
with gr.Row():
|
with gr.Row():
|
||||||
convert_button = gr.Button("转换")
|
convert_md_button = gr.Button("转换为 Word", variant="primary")
|
||||||
with gr.Row():
|
with gr.Row():
|
||||||
output_file = gr.File(label="下载转换后的文件")
|
conversion_status = gr.Textbox(label="转换状态", interactive=False)
|
||||||
|
with gr.Row():
|
||||||
|
word_file_output = gr.File(label="生成的 Word 文档")
|
||||||
|
|
||||||
convert_button.click(convert_to_docx, inputs=text_input, outputs=output_file)
|
convert_md_button.click(
|
||||||
|
convert_markdown_to_word,
|
||||||
|
inputs=markdown_input,
|
||||||
|
outputs=[conversion_status, word_file_output]
|
||||||
|
)
|
||||||
|
|
||||||
with gr.Tab("下载pdf"):
|
with gr.Tab("下载pdf"):
|
||||||
gr.Markdown("## pdf 下载指令修复")
|
gr.Markdown("## pdf 下载指令修复")
|
||||||
@ -353,4 +607,4 @@ with gr.Blocks() as iface:
|
|||||||
timer = gr.Timer(1.0)
|
timer = gr.Timer(1.0)
|
||||||
timer.tick(fn=update_log_output, outputs=[log_output])
|
timer.tick(fn=update_log_output, outputs=[log_output])
|
||||||
|
|
||||||
iface.launch(server_name="0.0.0.0")
|
iface.launch(server_name="0.0.0.0", server_port=7861)
|
||||||
|
|||||||
@ -2,3 +2,5 @@ gradio==5.49.0
|
|||||||
openai==1.44.1
|
openai==1.44.1
|
||||||
python-dotenv~=1.0.1
|
python-dotenv~=1.0.1
|
||||||
pillow~=10.4.0
|
pillow~=10.4.0
|
||||||
|
python-docx~=1.1.0
|
||||||
|
markdown~=3.5
|
||||||
Loading…
x
Reference in New Issue
Block a user