class MarkdownToDocxParser(HTMLParser):
    """Stream HTML (as produced by ``markdown.markdown``) into a python-docx document.

    The parser is fed HTML and appends headings, paragraphs, list items and
    tables to ``document`` as tags are encountered.  Only the outermost table
    is rendered; nested tables are ignored.

    Whitespace follows HTML rules outside ``<pre>``: newlines count as spaces
    and leading whitespace at the start of a line (new paragraph, new cell or
    right after ``<br>``) is discarded.  This matters because python-docx turns
    a literal newline in run text into a line break, so the newline that the
    ``nl2br`` extension leaves after ``<br />`` would otherwise be doubled.
    """

    _HEADING_TAGS = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')
    # Paragraph style used for <li>, keyed by the enclosing list tag.
    _LIST_STYLES = {'ul': 'List Bullet', 'ol': 'List Number'}

    def __init__(self, document):
        """Bind the parser to ``document``, the object new content is appended to."""
        super().__init__()
        self.doc = document
        self.current_paragraph = None
        self.current_run = None
        self.in_bold = False
        self.in_italic = False
        self.in_code = False
        self.in_heading = False
        self.heading_level = 0
        self.list_items = []  # unused; kept so existing attribute access keeps working
        # Table state
        self.in_table = False
        self.table_depth = 0  # nesting depth of <table>; only depth 1 is rendered
        self.current_table = None
        self.current_row = None
        self.current_cell = None  # non-None while inside a <td>/<th> of the outer table
        self.table_rows = []
        self.current_row_cells = []
        self.current_cell_content = []  # text tuples and {'type': 'break'} markers
        self.is_header_row = False
        # Private bookkeeping
        self._list_kinds = []  # stack of open 'ul' / 'ol' tags
        self._li_depth = 0  # number of open <li> elements outside tables
        self._in_pre = False  # inside <pre>: whitespace is significant
        self._at_line_start = True  # nothing written yet on the current line

    def _start_paragraph(self, paragraph):
        """Make ``paragraph`` the target for subsequent text."""
        self.current_paragraph = paragraph
        self._at_line_start = True

    def _add_cell_break(self):
        """Record a line break inside the table cell being collected."""
        self.current_cell_content.append({'type': 'break'})
        self._at_line_start = True

    @staticmethod
    def _style_run(run, bold, italic, code):
        """Apply the inline formatting flags to a python-docx run."""
        if bold:
            run.bold = True
        if italic:
            run.italic = True
        if code:
            run.font.name = 'Courier New'
            run.font.size = Pt(10)

    def handle_starttag(self, tag, attrs):
        """Open the document element or formatting state that ``tag`` starts."""
        if tag in self._HEADING_TAGS:
            self.in_heading = True
            self.heading_level = int(tag[1])
            self._start_paragraph(self.doc.add_heading(level=self.heading_level))
        elif tag == 'p':
            if self.in_table:
                # Paragraphs inside a cell become line breaks, but not before
                # the first piece of content (that would leave a blank line).
                if self.current_cell is not None and self.current_cell_content:
                    self._add_cell_break()
            elif (self._li_depth and self.current_paragraph is not None
                    and self._at_line_start):
                # "Loose" list item (<li><p>...): write into the still-empty
                # list paragraph instead of leaving an empty bullet behind.
                pass
            else:
                self._start_paragraph(self.doc.add_paragraph())
        elif tag == 'strong' or tag == 'b':
            self.in_bold = True
        elif tag == 'em' or tag == 'i':
            self.in_italic = True
        elif tag == 'code':
            self.in_code = True
        elif tag == 'pre':
            self._in_pre = True
        elif tag in self._LIST_STYLES:
            self._list_kinds.append(tag)
        elif tag == 'li':
            if not self.in_table:
                self._li_depth += 1
                kind = self._list_kinds[-1] if self._list_kinds else 'ul'
                self._start_paragraph(
                    self.doc.add_paragraph(style=self._LIST_STYLES[kind])
                )
        elif tag == 'br':
            if self.in_table:
                if self.current_cell is not None:
                    self._add_cell_break()
            elif self.current_paragraph is not None:
                self.current_paragraph.add_run().add_break()
                self._at_line_start = True
        # Table handling
        elif tag == 'table':
            self.table_depth += 1
            if self.table_depth == 1:  # only the outermost table is rendered
                self.in_table = True
                self.table_rows = []
        elif tag == 'thead':
            if self.table_depth == 1:
                self.is_header_row = True
        elif tag == 'tbody':
            if self.table_depth == 1:
                self.is_header_row = False
        elif tag == 'tr':
            if self.table_depth == 1:
                self.current_row_cells = []
        elif tag == 'th' or tag == 'td':
            if self.table_depth == 1:
                self.current_cell = []
                self.current_cell_content = []
                self._at_line_start = True
                if tag == 'th':
                    self.in_bold = True  # header cells are bold

    def handle_endtag(self, tag):
        """Close the element or formatting state that ``tag`` ends."""
        if tag in self._HEADING_TAGS:
            self.in_heading = False
            self.heading_level = 0
            # Release the heading so later text cannot be appended to it.
            self.current_paragraph = None
        elif tag == 'p':
            if not self.in_table:
                self.current_paragraph = None
        elif tag == 'strong' or tag == 'b':
            self.in_bold = False
        elif tag == 'em' or tag == 'i':
            self.in_italic = False
        elif tag == 'code':
            self.in_code = False
        elif tag == 'pre':
            self._in_pre = False
            if not self.in_table:
                self.current_paragraph = None
        elif tag in self._LIST_STYLES:
            if self._list_kinds:
                self._list_kinds.pop()
        elif tag == 'li':
            if not self.in_table:
                self._li_depth = max(0, self._li_depth - 1)
                self.current_paragraph = None
        # Table handling
        elif tag == 'table':
            if self.table_depth > 0:  # ignore a stray </table>
                self.table_depth -= 1
                if self.table_depth == 0:
                    self.in_table = False
                    self.current_paragraph = None
                    self._create_table()
        elif tag == 'thead':
            if self.table_depth == 1:
                self.is_header_row = False
        elif tag == 'tr':
            if self.table_depth == 1 and self.current_row_cells:
                self.table_rows.append(self.current_row_cells)
                self.current_row_cells = []
        elif tag == 'th' or tag == 'td':
            if self.table_depth == 1 and self.current_cell is not None:
                # Keep the cell content together with its formatting flags.
                self.current_row_cells.append(self.current_cell_content.copy())
                self.current_cell = None
                self.current_cell_content = []
                if tag == 'th':
                    self.in_bold = False

    def _create_table(self):
        """Render the collected ``table_rows`` as a Word table and reset them."""
        if not self.table_rows:
            return

        max_cols = max(len(row) for row in self.table_rows)
        if max_cols == 0:
            return

        table = self.doc.add_table(rows=len(self.table_rows), cols=max_cols)
        table.style = 'Light Grid Accent 1'

        for i, row_data in enumerate(self.table_rows):
            row = table.rows[i]
            for j, cell_content_list in enumerate(row_data):
                if j >= len(row.cells):
                    continue
                cell = row.cells[j]
                # Clear the default paragraph, then write into it.
                cell.text = ''
                para = cell.paragraphs[0] if cell.paragraphs else cell.add_paragraph()

                for content_item in cell_content_list:
                    if isinstance(content_item, dict):
                        if content_item.get('type') == 'break':
                            para.add_run().add_break()
                    else:
                        text, is_bold, is_italic, is_code = content_item
                        self._style_run(para.add_run(text), is_bold, is_italic, is_code)

                # The first row is always rendered bold (table header).
                if i == 0:
                    for paragraph in cell.paragraphs:
                        for run in paragraph.runs:
                            run.bold = True

        self.table_rows = []

    def handle_data(self, data):
        """Append text to the current table cell or paragraph."""
        if not self._in_pre:
            # Outside <pre> a newline is ordinary HTML whitespace; real line
            # breaks arrive as <br>.
            data = data.replace('\r\n', '\n').replace('\r', '\n').replace('\n', ' ')

        if self.in_table:
            # Only text inside a cell of the outermost table is kept; the
            # whitespace between <tr>/<td> tags and nested tables are skipped.
            if self.current_cell is not None and self.table_depth == 1:
                if self._at_line_start and not self._in_pre:
                    data = data.lstrip()
                if data:
                    self.current_cell_content.append(
                        (data, self.in_bold, self.in_italic, self.in_code)
                    )
                    self._at_line_start = False
            return

        if self.current_paragraph is None or self._at_line_start:
            if self._in_pre:
                if self.current_paragraph is None and not data.strip():
                    return
            else:
                data = data.lstrip()
                if not data:
                    return

        if self.current_paragraph is None:
            self._start_paragraph(self.doc.add_paragraph())

        run = self.current_paragraph.add_run(data)
        self._at_line_start = False
        self._style_run(run, self.in_bold, self.in_italic, self.in_code)


def _unique_docx_path(output_dir, stem):
    """Return a ``.docx`` path in ``output_dir`` that does not exist yet.

    ``<stem>.docx`` is used when free; otherwise ``_1``, ``_2``, ... is
    appended so two conversions within the same second do not overwrite
    each other.
    """
    candidate = os.path.join(output_dir, f"{stem}.docx")
    suffix = 1
    while os.path.exists(candidate):
        candidate = os.path.join(output_dir, f"{stem}_{suffix}.docx")
        suffix += 1
    return candidate


def convert_markdown_to_word(markdown_text):
    """Convert Markdown text to a Word document saved under ``word_output``.

    Args:
        markdown_text: Markdown source entered by the user.

    Returns:
        A ``(message, path)`` tuple.  ``path`` is the generated ``.docx`` file,
        or ``None`` when the input is empty or the conversion failed; in that
        case ``message`` describes the problem.
    """
    if not markdown_text or not markdown_text.strip():
        return "请输入 Markdown 内容!", None

    output_dir = "word_output"
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

    # This is the UI boundary: every failure (including not being able to
    # create the output directory) is reported as a status message.
    try:
        os.makedirs(output_dir, exist_ok=True)
        docx_path = _unique_docx_path(output_dir, f"document_{timestamp}")

        # Markdown -> HTML (tables enabled)
        html_content = markdown.markdown(
            markdown_text,
            extensions=['extra', 'nl2br', 'tables']
        )

        # HTML -> Word
        doc = Document()
        parser = MarkdownToDocxParser(doc)
        parser.feed(html_content)
        parser.close()  # flush any text still buffered by the parser

        doc.save(docx_path)
    except Exception as e:
        return f"转换失败: {e}", None

    docx_filename = os.path.basename(docx_path)
    # Only the file generated by this call is returned.
    return f"转换成功!文件已保存为: {docx_filename}", docx_path


def get_word_files():
    """Return the ``.docx`` files in ``word_output``, most recently modified first.

    Returns an empty list when the directory does not exist.  Entries that are
    not regular files, or that disappear while being inspected, are skipped.
    """
    output_dir = "word_output"
    if not os.path.isdir(output_dir):
        return []

    dated = []
    for filename in os.listdir(output_dir):
        if not filename.endswith('.docx'):
            continue
        file_path = os.path.join(output_dir, filename)
        if not os.path.isfile(file_path):
            continue
        try:
            dated.append((os.path.getmtime(file_path), file_path))
        except OSError:
            # Removed between listing and stat; nothing to report for it.
            continue

    dated.sort(key=lambda item: item[0], reverse=True)
    return [file_path for _, file_path in dated]
convert_md_button = gr.Button("转换为 Word", variant="primary") with gr.Row(): - output_file = gr.File(label="下载转换后的文件") - - convert_button.click(convert_to_docx, inputs=text_input, outputs=output_file) + conversion_status = gr.Textbox(label="转换状态", interactive=False) + with gr.Row(): + word_file_output = gr.File(label="生成的 Word 文档") + + convert_md_button.click( + convert_markdown_to_word, + inputs=markdown_input, + outputs=[conversion_status, word_file_output] + ) with gr.Tab("下载pdf"): gr.Markdown("## pdf 下载指令修复") @@ -353,4 +607,4 @@ with gr.Blocks() as iface: timer = gr.Timer(1.0) timer.tick(fn=update_log_output, outputs=[log_output]) -iface.launch(server_name="0.0.0.0") +iface.launch(server_name="0.0.0.0", server_port=7861) diff --git a/requirements.txt b/requirements.txt index 8a54e03..486a958 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,6 @@ gradio==5.49.0 openai==1.44.1 python-dotenv~=1.0.1 -pillow~=10.4.0 \ No newline at end of file +pillow~=10.4.0 +python-docx~=1.1.0 +markdown~=3.5 \ No newline at end of file