From 3f72706e241266b09f50530d3ad888a517d4a55d Mon Sep 17 00:00:00 2001
From: William Jin <jinwei@gmail.com>
Date: Tue, 15 Oct 2024 10:07:46 +0800
Subject: [PATCH] =?UTF-8?q?=E4=BC=98=E5=8C=96=E6=89=B9=E9=87=8F=E5=9B=BE?=
 =?UTF-8?q?=E7=89=87=E8=AF=86=E5=88=AB?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitignore           |  1 +
 chyoso_toolkit_ui.py | 26 +++++++++++++++++++-------
 2 files changed, 20 insertions(+), 7 deletions(-)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..2eea525
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+.env
\ No newline at end of file
diff --git a/chyoso_toolkit_ui.py b/chyoso_toolkit_ui.py
index bd40b0d..2d8c755 100644
--- a/chyoso_toolkit_ui.py
+++ b/chyoso_toolkit_ui.py
@@ -1,3 +1,4 @@
+import PIL
 import gradio as gr
 import subprocess
 import os
@@ -199,18 +200,27 @@ def run_ocr(image):
     return content
 
 
-def ocr_image(image):
-    # 将图片转换为 base64 编码
+def ocr_image(image, user_query=""):
+    # 压缩图片并降低分辨率
+    max_size = 1600
+    aspect_ratio = image.width / image.height
+    if aspect_ratio > 1:  # 宽度大于高度
+        new_size = (max_size, int(max_size / aspect_ratio))
+    else:  # 高度大于或等于宽度
+        new_size = (int(max_size * aspect_ratio), max_size)
+    compressed_image = image.resize(new_size, PIL.Image.Resampling.LANCZOS)
     buffered = io.BytesIO()
-    image.save(buffered, format="JPEG")
+
+    compressed_image.save(buffered, format="JPEG", quality=75)  # set JPEG quality to 75
     image_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
+    print(f"Base64 字符串长度: {len(image_base64) / 1024:.2f} k")
     messages = [
         {
             "role": "user",
             "content": [
                 {
                     "type": "text",
-                    "text": "识别图片中的文字并以纯文本(txt)格式输出"
+                    "text": "识别图片中的文字并以纯文本(txt)格式输出，如果图片分为左右两栏，则先输出左边栏再输出右边栏的内容。" + user_query
                 },
                 {
                     "type": "image_url",
@@ -234,12 +244,12 @@ def ocr_image(image):
     return chat_completion.choices[0].message.content
 
 
-def batch_ocr(files):
+def batch_ocr(files, user_query):
     results = []
     for file in files:
         try:
             image = Image.open(file)
-            ocr_result = ocr_image(image)
+            ocr_result = ocr_image(image, user_query)
             results.append(f"\n\n{ocr_result}\n\n")
         except Exception as e:
             print(e)
@@ -334,9 +344,11 @@ with gr.Blocks() as iface:
                     output_file = gr.File(label="Download OCR Results")
             with gr.Row():
                 process_button = gr.Button("Process Images")
+            with gr.Row():
+                user_query_text = gr.Textbox(placeholder="输入额外的要求")
 
             # 绑定按钮点击事件
-            process_button.click(batch_ocr, inputs=input_files, outputs=output_file)
+            process_button.click(batch_ocr, inputs=[input_files, user_query_text], outputs=output_file)
 
     iface.load(fn=update_log_output, outputs=[log_output], every=1)