opendatalab · myhloli · Nov 15, 2024 · Nov 15, 2024 · Nov 15, 2024 · Nov 15, 2024
diff --git a/README.md b/README.md
@@ -121,7 +121,7 @@ https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c
 - Preserve the structure of the original document, including headings, paragraphs, lists, etc.
 - Extract images, image descriptions, tables, table titles, and footnotes.
 - Automatically recognize and convert formulas in the document to LaTeX format.
-- Automatically recognize and convert tables in the document to LaTeX or HTML format.
+- Automatically recognize and convert tables in the document to HTML format.
 - Automatically detect scanned PDFs and garbled PDFs and enable OCR functionality.
 - OCR supports detection and recognition of 84 languages.
 - Supports multiple output formats, such as multimodal and NLP Markdown, JSON sorted by reading order, and rich intermediate formats.
@@ -185,17 +185,11 @@ There are three different ways to experience MinerU:
     </tr>
     <tr>
         <td rowspan="2">GPU Hardware Support List</td>
-        <td colspan="2">Minimum Requirement 8G+ VRAM</td>
-        <td colspan="2">3060ti/3070/4060<br>
-        8G VRAM enables layout, formula recognition acceleration and OCR acceleration</td>
+        <td colspan="2">GPU VRAM 8GB or more</td>
+        <td colspan="2">2080~2080Ti / 3060Ti~3090Ti / 4060~4090<br>
+        8GB of VRAM or more enables all acceleration features</td>
         <td rowspan="2">None</td>
     </tr>
-    <tr>
-        <td colspan="2">Recommended Configuration 10G+ VRAM</td>
-        <td colspan="2">3080/3080ti/3090/3090ti/4070/4070ti/4070tisuper/4080/4090<br>
-        10G VRAM or more can enable layout, formula recognition, OCR acceleration and table recognition acceleration simultaneously
-        </td>
-    </tr>
 </table>
 
 ### Online Demo
@@ -247,7 +241,7 @@ You can modify certain configurations in this file to enable or disable features
         "enable": true  // The formula recognition feature is enabled by default. If you need to disable it, please change the value here to "false".
     },
     "table-config": {
-        "model": "rapid_table",  // When using structEqTable, please change to "struct_eqtable".
+        "model": "rapid_table",  // Defaults to "rapid_table"; can be switched to "tablemaster" or "struct_eqtable".
         "enable": false, // The table recognition feature is disabled by default. If you need to enable it, please change the value here to "true".
         "max_time": 400
     }

diff --git a/README_zh-CN.md b/README_zh-CN.md
@@ -121,7 +121,7 @@ https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c
 - 保留原文档的结构，包括标题、段落、列表等
 - 提取图像、图片描述、表格、表格标题及脚注
 - 自动识别并转换文档中的公式为LaTeX格式
-- 自动识别并转换文档中的表格为LaTeX或HTML格式
+- 自动识别并转换文档中的表格为HTML格式
 - 自动检测扫描版PDF和乱码PDF，并启用OCR功能
 - OCR支持84种语言的检测与识别
 - 支持多种输出格式，如多模态与NLP的Markdown、按阅读顺序排序的JSON、含有丰富信息的中间格式等
@@ -186,17 +186,13 @@ https://github.com/user-attachments/assets/4bea02c9-6d54-4cd6-97ed-dff14340982c
     </tr>
     <tr>
         <td rowspan="2">GPU硬件支持列表</td>
-        <td colspan="2">最低要求 8G+显存</td>
-        <td colspan="2">3060ti/3070/4060<br>
-        8G显存可开启全部加速功能(表格仅限rapid_table)</td>
+        <td colspan="2">显存8G以上</td>
+        <td colspan="2">
+        2080~2080Ti / 3060Ti~3090Ti / 4060~4090<br>
+        8G显存及以上可开启全部加速功能</td>
         <td rowspan="2">None</td>
     </tr>
-    <tr>
-        <td colspan="2">推荐配置 10G+显存</td>
-        <td colspan="2">3080/3080ti/3090/3090ti/4070/4070ti/4070tisuper/4080/4090<br>
-        10G显存及以上可开启全部加速功能<br>
-        </td>
-    </tr>
+
 </table>
 
 ### 在线体验
@@ -251,7 +247,7 @@ pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com -i h
         "enable": true  // 公式识别功能默认是开启的，如果需要关闭请修改此处的值为"false"
     },
     "table-config": {
-        "model": "rapid_table",  // 使用structEqTable请修改为"struct_eqtable"
+        "model": "rapid_table",  // 默认使用"rapid_table",可以切换为"tablemaster"和"struct_eqtable"
         "enable": false, // 表格识别功能默认是关闭的，如果需要开启请修改此处的值为"true"
         "max_time": 400
     }

diff --git a/projects/web_demo/web_demo/api/analysis/pdf_ext.py b/projects/web_demo/web_demo/api/analysis/pdf_ext.py
@@ -1,13 +1,15 @@
 import json
 import re
+import os
+import shutil
 import traceback
 from pathlib import Path
 from flask import current_app, url_for
 from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
 from magic_pdf.pipe.UNIPipe import UNIPipe
 import magic_pdf.model as model_config
 from magic_pdf.libs.json_compressor import JsonCompressor
-from magic_pdf.dict2md.ocr_mkcontent import ocr_mk_mm_markdown_with_para_and_pagination
+from common.mk_markdown.mk_markdown import ocr_mk_mm_markdown_with_para_and_pagination
 from .ext import find_file
 from ..extentions import app, db
 from .models import AnalysisPdf, AnalysisTask
@@ -17,7 +19,7 @@
 model_config.__use_inside_model__ = True
 
 
-def analysis_pdf(image_dir, pdf_bytes, is_ocr=False):
+def analysis_pdf(image_url_prefix, image_dir, pdf_bytes, is_ocr=False):
     try:
         model_json = []  # model_json传空list使用内置模型解析
         logger.info(f"is_ocr: {is_ocr}")
@@ -40,7 +42,7 @@ def analysis_pdf(image_dir, pdf_bytes, is_ocr=False):
         pipe.pipe_parse()
         pdf_mid_data = JsonCompressor.decompress_json(pipe.get_compress_pdf_mid_data())
         pdf_info_list = pdf_mid_data["pdf_info"]
-        md_content = json.dumps(ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_list, image_dir),
+        md_content = json.dumps(ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_list, image_url_prefix),
                                 ensure_ascii=False)
         bbox_info = get_bbox_info(pdf_info_list)
         return md_content, bbox_info
@@ -77,20 +79,22 @@ def analysis_pdf_task(pdf_dir, image_dir, pdf_path, is_ocr, analysis_pdf_id):
         logger.info(f"image_dir: {image_dir}")
         if not Path(image_dir).exists():
             Path(image_dir).mkdir(parents=True, exist_ok=True)
+        else:
+            # 清空image_dir，避免同文件多次解析图片积累
+            shutil.rmtree(image_dir, ignore_errors=True)
+            os.makedirs(image_dir, exist_ok=True)
+
+        # 获取文件内容
         with open(pdf_path, 'rb') as file:
             pdf_bytes = file.read()
-        md_content, bbox_info = analysis_pdf(image_dir, pdf_bytes, is_ocr)
-        img_list = Path(image_dir).glob('*') if Path(image_dir).exists() else []
+        # 生成图片链接
+        with app.app_context():
+            image_url_prefix = f"http://{current_app.config['SERVER_NAME']}{current_app.config['FILE_API']}&pdf={Path(pdf_path).name}&filename="
+        # 解析文件
+        md_content, bbox_info = analysis_pdf(image_url_prefix, image_dir, pdf_bytes, is_ocr)
 
+        # ############ markdown #############
         pdf_name = Path(pdf_path).name
-        with app.app_context():
-            for img in img_list:
-                img_name = Path(img).name
-                regex = re.compile(fr'.*\((.*?{img_name})')
-                regex_result = regex.search(md_content)
-                if regex_result:
-                    img_url = url_for('analysis.imgview', filename=img_name, as_attachment=False)
-                    md_content = md_content.replace(regex_result.group(1), f"{img_url}&pdf={pdf_name}")
 
         full_md_content = ""
         for item in json.loads(md_content):

diff --git a/projects/web_demo/web_demo/app.py b/projects/web_demo/web_demo/app.py
@@ -42,7 +42,7 @@ class ConfigMap(dict):
 ip_address = get_local_ip()
 port = config.get("PORT", 5559)
 # 配置 SERVER_NAME
-config['SERVER_NAME'] = f'{ip_address}:5559'
+config['SERVER_NAME'] = f'{ip_address}:{port}'
 # 配置 APPLICATION_ROOT
 config['APPLICATION_ROOT'] = '/'
 # 配置 PREFERRED_URL_SCHEME

diff --git a/projects/web_demo/web_demo/common/mk_markdown/__init__.py b/projects/web_demo/web_demo/common/mk_markdown/__init__.py
diff --git a/projects/web_demo/web_demo/common/mk_markdown/libs/__init__.py b/projects/web_demo/web_demo/common/mk_markdown/libs/__init__.py
diff --git a/projects/web_demo/web_demo/common/mk_markdown/libs/language.py b/projects/web_demo/web_demo/common/mk_markdown/libs/language.py
@@ -0,0 +1,36 @@
+import os
+import unicodedata
+
+if not os.getenv("FTLANG_CACHE"):  # only supply a default when the variable is unset or empty
+    current_file_path = os.path.abspath(__file__)
+    current_dir = os.path.dirname(current_file_path)
+    root_dir = os.path.dirname(current_dir)  # parent of the directory containing this file
+    ftlang_cache_dir = os.path.join(root_dir, 'resources', 'fasttext-langdetect')
+    os.environ["FTLANG_CACHE"] = str(ftlang_cache_dir)
+    # NOTE(review): this runs before the fast_langdetect import below, presumably so the library sees the value — confirm
+
+from fast_langdetect import detect_language
+
+
+def detect_lang(text: str) -> str:
+    """Return detect_language(text) lower-cased; "" for empty text or when the result has no .lower()."""
+    if not text:
+        return ""
+    try:
+        lang_upper = detect_language(text)
+    except Exception:  # Exception, not a bare except: KeyboardInterrupt/SystemExit must propagate
+        text_no_ctrl = ''.join(ch for ch in text if unicodedata.category(ch)[0] != 'C')  # drop category-C chars, retry once
+        lang_upper = detect_language(text_no_ctrl)
+    try:
+        lang = lang_upper.lower()
+    except AttributeError:  # result was not a string (e.g. None)
+        lang = ""
+    return lang
+
+
+if __name__ == '__main__':  # manual smoke check: print the cache dir, then detections for plain and <html>-wrapped samples
+    print(os.getenv("FTLANG_CACHE"))
+    print(detect_lang("This is a test."))
+    print(detect_lang("<html>This is a test</html>"))
+    print(detect_lang("这个是中文测试。"))
+    print(detect_lang("<html>这个是中文测试。</html>"))
diff --git a/projects/web_demo/web_demo/common/mk_markdown/libs/markdown_utils.py b/projects/web_demo/web_demo/common/mk_markdown/libs/markdown_utils.py
@@ -0,0 +1,31 @@
+import re
+
+
+def escape_special_markdown_char(pymu_blocks):
+    """
+    Escape characters in body text that have special meaning in Markdown syntax; mutates the spans in place and returns pymu_blocks.
+    """
+    special_chars = ["*", "`", "~", "$"]
+    for blk in pymu_blocks:
+        for line in blk['lines']:
+            for span in line['spans']:
+                for char in special_chars:
+                    span_text = span['text']
+                    span_type = span.get("_type", None)
+                    if span_type in ['inline-equation', 'interline-equation']:
+                        continue  # equation spans are left unescaped
+                    elif span_text:
+                        span['text'] = span['text'].replace(char, "\\" + char)  # prefix each occurrence with a backslash
+
+    return pymu_blocks
+
+
+def ocr_escape_special_markdown_char(content):
+    """
+    Escape characters in body text that have special meaning in Markdown syntax; returns content with each * ` ~ $ backslash-prefixed.
+    """
+    special_chars = ["*", "`", "~", "$"]
+    for char in special_chars:
+        content = content.replace(char, "\\" + char)
+
+    return content
diff --git a/projects/web_demo/web_demo/common/mk_markdown/libs/ocr_content_type.py b/projects/web_demo/web_demo/common/mk_markdown/libs/ocr_content_type.py
@@ -0,0 +1,38 @@
+class ContentType:  # string constants; NOTE(review): presumably span-level content types — confirm against callers
+    Image = 'image'
+    Table = 'table'
+    Text = 'text'
+    InlineEquation = 'inline_equation'
+    InterlineEquation = 'interline_equation'
+
+
+class BlockType:  # string constants; NOTE(review): presumably block-level type tags — confirm against callers
+    Image = 'image'
+    ImageBody = 'image_body'
+    ImageCaption = 'image_caption'
+    ImageFootnote = 'image_footnote'
+    Table = 'table'
+    TableBody = 'table_body'
+    TableCaption = 'table_caption'
+    TableFootnote = 'table_footnote'
+    Text = 'text'
+    Title = 'title'
+    InterlineEquation = 'interline_equation'
+    Footnote = 'footnote'
+    Discarded = 'discarded'
+
+
+class CategoryId:  # integer constants (9-12 are not defined here); NOTE(review): presumably detection-model category ids — confirm
+    Title = 0
+    Text = 1
+    Abandon = 2
+    ImageBody = 3
+    ImageCaption = 4
+    TableBody = 5
+    TableCaption = 6
+    TableFootnote = 7
+    InterlineEquation_Layout = 8
+    InlineEquation = 13
+    InterlineEquation_YOLO = 14
+    OcrText = 15
+    ImageFootnote = 101