-
Notifications
You must be signed in to change notification settings - Fork 1.2k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #971 from LollipopsAndWine/dev
- Loading branch information
Showing 10 changed files with 294 additions and 14 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
Empty file.
36 changes: 36 additions & 0 deletions
36
projects/web_demo/web_demo/common/mk_markdown/libs/language.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
import os | ||
import unicodedata | ||
|
||
# Default FTLANG_CACHE to the model directory bundled with this package
# (<parent of this file's directory>/resources/fasttext-langdetect) unless the
# environment already provides a non-empty value.  This runs ahead of the
# `fast_langdetect` import further down; presumably the library consults the
# variable when locating its model -- TODO confirm.
if not os.getenv("FTLANG_CACHE"):
    current_file_path = os.path.abspath(__file__)
    current_dir = os.path.dirname(current_file_path)
    root_dir = os.path.dirname(current_dir)
    ftlang_cache_dir = os.path.join(root_dir, 'resources', 'fasttext-langdetect')
    os.environ["FTLANG_CACHE"] = str(ftlang_cache_dir)
|
||
from fast_langdetect import detect_language | ||
|
||
|
||
def detect_lang(text: str) -> str:
    """Return the lowercase language code detected for *text*.

    Detection is delegated to ``detect_language`` (imported from
    ``fast_langdetect``).  If the first attempt raises, the text is retried
    once with every character whose Unicode general category starts with
    ``C`` (control, format, surrogate, private-use, unassigned) removed.

    Args:
        text: The text to classify.

    Returns:
        The detected language code in lowercase (callers compare it against
        values such as ``"en"`` or ``"zh"``), or ``""`` when *text* is empty
        or no language could be determined.
    """
    if not text:
        return ""

    try:
        lang_upper = detect_language(text)
    except Exception:
        # Retry on a sanitized copy: strip all "Other" (C*) category
        # characters, which covers embedded newlines and other control codes.
        cleaned = ''.join(
            ch for ch in text if not unicodedata.category(ch).startswith('C')
        )
        if not cleaned:
            return ""
        try:
            lang_upper = detect_language(cleaned)
        except Exception:
            # Best effort: language detection must not abort the caller.
            return ""

    try:
        return lang_upper.lower()
    except AttributeError:
        # The detector returned something that is not a string.
        return ""
|
||
|
||
if __name__ == '__main__':
    # Manual smoke test: show the model cache location, then the language
    # detected for plain and HTML-wrapped English and Chinese samples.
    print(os.getenv("FTLANG_CACHE"))
    for sample in (
        "This is a test.",
        "<html>This is a test</html>",
        "这个是中文测试。",
        "<html>这个是中文测试。</html>",
    ):
        print(detect_lang(sample))
31 changes: 31 additions & 0 deletions
31
projects/web_demo/web_demo/common/mk_markdown/libs/markdown_utils.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
import re | ||
|
||
|
||
def escape_special_markdown_char(pymu_blocks):
    """Backslash-escape Markdown-significant characters in span text, in place.

    Every ``*``, `````, ``~`` and ``$`` in a span's ``'text'`` is prefixed
    with a backslash.  Spans whose ``'_type'`` is ``'inline-equation'`` or
    ``'interline-equation'`` are left untouched, as are spans whose text is
    empty or missing.

    Args:
        pymu_blocks: Iterable of block dicts; each block has a ``'lines'``
            list, each line a ``'spans'`` list of span dicts.

    Returns:
        The same ``pymu_blocks`` object, with span texts modified in place.
    """
    # One translation table handles all four characters in a single pass.
    escapes = str.maketrans(
        {char: "\\" + char for char in ("*", "`", "~", "$")}
    )
    for blk in pymu_blocks:
        for line in blk['lines']:
            for span in line['spans']:
                # Equation spans hold formula source, not prose.
                if span.get("_type") in ('inline-equation', 'interline-equation'):
                    continue
                span_text = span.get('text')
                if span_text:
                    span['text'] = span_text.translate(escapes)

    return pymu_blocks
|
||
|
||
def ocr_escape_special_markdown_char(content):
    """Return *content* with Markdown-significant characters escaped.

    Each ``*``, `````, ``~`` and ``$`` is prefixed with a backslash; all
    other characters are copied through unchanged.
    """
    escape_table = str.maketrans({
        "*": "\\*",
        "`": "\\`",
        "~": "\\~",
        "$": "\\$",
    })
    return content.translate(escape_table)
38 changes: 38 additions & 0 deletions
38
projects/web_demo/web_demo/common/mk_markdown/libs/ocr_content_type.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
class ContentType:
    """String tags for the kind of content a span carries.

    The Markdown builder compares a span's ``'type'`` value against these
    constants.
    """

    Image = 'image'
    Table = 'table'
    Text = 'text'
    InlineEquation = 'inline_equation'
    InterlineEquation = 'interline_equation'
|
||
|
||
class BlockType:
    """String tags for the kind of a layout block.

    The Markdown builder compares a block's ``'type'`` value against these
    constants, both for top-level paragraph blocks and for the sub-blocks
    nested under image and table blocks.
    """

    # Image block and its sub-blocks.
    Image = 'image'
    ImageBody = 'image_body'
    ImageCaption = 'image_caption'
    ImageFootnote = 'image_footnote'
    # Table block and its sub-blocks.
    Table = 'table'
    TableBody = 'table_body'
    TableCaption = 'table_caption'
    TableFootnote = 'table_footnote'
    # Remaining block kinds.
    Text = 'text'
    Title = 'title'
    InterlineEquation = 'interline_equation'
    Footnote = 'footnote'
    Discarded = 'discarded'
|
||
|
||
class CategoryId:
    """Integer ids for detected region categories.

    NOTE(review): presumably these are the class ids emitted by the layout /
    formula detection models (the ``_Layout`` / ``_YOLO`` suffixes suggest
    two different producers) -- confirm against the model output schema.
    The numbering is not contiguous.
    """

    Title = 0
    Text = 1
    Abandon = 2
    ImageBody = 3
    ImageCaption = 4
    TableBody = 5
    TableCaption = 6
    TableFootnote = 7
    InterlineEquation_Layout = 8
    InlineEquation = 13
    InterlineEquation_YOLO = 14
    OcrText = 15
    ImageFootnote = 101
169 changes: 169 additions & 0 deletions
169
projects/web_demo/web_demo/common/mk_markdown/mk_markdown.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,169 @@ | ||
import re | ||
import wordninja | ||
from .libs.language import detect_lang | ||
from .libs.markdown_utils import ocr_escape_special_markdown_char | ||
from .libs.ocr_content_type import BlockType, ContentType | ||
|
||
|
||
def __is_hyphen_at_line_end(line):
    """Tell whether *line* ends with a hyphenated word fragment.

    A match requires at least one ASCII letter immediately followed by a
    hyphen, with nothing but optional whitespace after the hyphen.

    Args:
        line (str): The text to inspect.
    Returns:
        bool: True when the text ends in ``<letters>-`` (optionally followed
        by whitespace), False otherwise.
    """
    trailing_hyphen = re.compile(r'[A-Za-z]+-\s*$')
    return trailing_hyphen.search(line) is not None
|
||
|
||
def split_long_words(text, max_word_len=10):
    """Break over-long word runs in *text* into separate words.

    The text is split on single spaces; within each segment, every maximal
    run of word characters longer than *max_word_len* is passed to
    ``wordninja.split`` and the resulting pieces are rejoined with spaces.
    Shorter runs, punctuation and the original spacing are preserved.

    Args:
        text: The text to process.
        max_word_len: Longest word-character run left untouched.  Defaults
            to 10, the threshold this function has always used.

    Returns:
        The text with long runs segmented into space-separated words.
    """
    def _split_token(token):
        # Non-word tokens are single characters, so only word runs can
        # exceed the threshold.
        if len(token) > max_word_len:
            return ' '.join(wordninja.split(token))
        return token

    return ' '.join(
        ''.join(_split_token(token)
                for token in re.findall(r'\w+|[^\w]', segment))
        for segment in text.split(' ')
    )
|
||
|
||
def join_path(*args):
    """Concatenate the arguments after stripping their trailing slashes.

    Each argument is converted with ``str`` and has trailing ``/``
    characters removed; the pieces are then joined with no separator.

    NOTE(review): despite the name, no ``/`` is inserted between pieces --
    presumably callers pass components that already carry their own
    separator; confirm before relying on this for filesystem paths.
    """
    pieces = [str(part).rstrip('/') for part in args]
    return ''.join(pieces)
|
||
|
||
def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list,
                                                img_buket_path):
    """Build per-page multimodal ('mm') Markdown for a parsed document.

    Args:
        pdf_info_dict: List of page-info dicts, one per page in document
            order.  A page's paragraph blocks are read from its
            ``'para_blocks'`` key.
        img_buket_path: Prefix passed through to the block renderer for
            building image links.

    Returns:
        A list with one dict per input page, each holding ``'page_no'``
        (the zero-based index of the page in *pdf_info_dict*) and
        ``'md_content'`` (the page's paragraphs joined by blank lines).
        Pages with no paragraph blocks yield an empty ``'md_content'`` so
        that the page numbers of later pages stay aligned with the document.
    """
    markdown_with_para_and_pagination = []
    for page_no, page_info in enumerate(pdf_info_dict):
        paras_of_layout = page_info.get('para_blocks')
        if paras_of_layout:
            page_markdown = ocr_mk_markdown_with_para_core_v2(
                paras_of_layout, 'mm', img_buket_path)
            md_content = '\n\n'.join(page_markdown)
        else:
            # Keep a placeholder for blank pages rather than dropping them,
            # otherwise every following page would be numbered too low.
            md_content = ''
        markdown_with_para_and_pagination.append({
            'page_no': page_no,
            'md_content': md_content,
        })
    return markdown_with_para_and_pagination
|
||
|
||
def merge_para_with_text(para_block):
    """Flatten a paragraph block's spans into one Markdown string.

    For every line, the stripped text spans are concatenated and handed to
    ``detect_lang`` to obtain a line-level language.  Each span is then
    rendered:

    * text spans are Markdown-escaped; spans that are at least half ASCII
      letters are first run through ``split_long_words``;
    * inline equations become `` $...$ ``;
    * interline equations become a ``$$`` block on its own lines.

    Rendered spans are joined according to the line language: no separator
    for ``zh`` / ``ja`` / ``ko``; otherwise a trailing space, except that in
    English lines a span ending in ``<letters>-`` has the hyphen (and any
    whitespace after it) removed so the word continues in the next span.

    Args:
        para_block: Block dict with a ``'lines'`` list; each line has a
            ``'spans'`` list of dicts carrying ``'type'`` and ``'content'``.

    Returns:
        The merged paragraph text.
    """
    def _is_mostly_english(text):
        # Character-ratio heuristic: at least half of the characters are
        # ASCII letters.  Empty text never qualifies.
        if not text:
            return False
        en_length = sum(len(match) for match in re.findall(r'[a-zA-Z]+', text))
        return en_length / len(text) >= 0.5

    # Languages written without spaces between words.
    unspaced_langs = ('zh', 'ja', 'ko')

    parts = []
    for line in para_block['lines']:
        # Judge the language on the whole line: some documents put a single
        # character in each span, which is too little text to classify.
        line_text = ''.join(
            span['content'].strip()
            for span in line['spans']
            if span['type'] == ContentType.Text
        )
        line_lang = detect_lang(line_text) if line_text else ''

        for span in line['spans']:
            span_type = span['type']
            content = ''
            if span_type == ContentType.Text:
                content = span['content']
                # Only split long words in English text; segmenting Chinese
                # text this way would lose characters.
                if _is_mostly_english(content):
                    content = split_long_words(content)
                content = ocr_escape_special_markdown_char(content)
            elif span_type == ContentType.InlineEquation:
                content = f" ${span['content']}$ "
            elif span_type == ContentType.InterlineEquation:
                content = f"\n$$\n{span['content']}\n$$\n"

            if not content:
                continue

            if line_lang in unspaced_langs:
                parts.append(content)
            elif line_lang == 'en' and __is_hyphen_at_line_end(content):
                # The hyphen check tolerates trailing whitespace, so strip
                # that first; otherwise the slice would remove the
                # whitespace and leave the hyphen behind.
                parts.append(content.rstrip()[:-1])
            else:
                # Space-delimited languages: separate spans with a space.
                parts.append(content + ' ')
    return ''.join(parts)
|
||
|
||
def _image_block_markdown(para_block, img_buket_path):
    """Render an image block: image bodies, then captions, then footnotes."""
    sub_blocks = para_block['blocks']
    chunks = []
    # 1st: image bodies become Markdown image links.
    for block in sub_blocks:
        if block['type'] != BlockType.ImageBody:
            continue
        for line in block['lines']:
            for span in line['spans']:
                if span['type'] == ContentType.Image:
                    chunks.append(
                        f"\n![]({join_path(img_buket_path, span['image_path'])}) \n")
    # 2nd: image captions.
    for block in sub_blocks:
        if block['type'] == BlockType.ImageCaption:
            chunks.append(merge_para_with_text(block))
    # 3rd: image footnotes.
    for block in sub_blocks:
        if block['type'] == BlockType.ImageFootnote:
            chunks.append(merge_para_with_text(block))
    return ''.join(chunks)


def _table_block_markdown(para_block, img_buket_path):
    """Render a table block: captions, then table bodies, then footnotes."""
    sub_blocks = para_block['blocks']
    chunks = []
    # 1st: table captions.
    for block in sub_blocks:
        if block['type'] == BlockType.TableCaption:
            chunks.append(merge_para_with_text(block))
    # 2nd: table bodies.
    for block in sub_blocks:
        if block['type'] != BlockType.TableBody:
            continue
        for line in block['lines']:
            for span in line['spans']:
                if span['type'] != ContentType.Table:
                    continue
                # Prefer table-model output (LaTeX, then HTML); otherwise
                # fall back to linking the table image.
                if span.get('latex', ''):
                    chunks.append(f"\n\n$\n {span['latex']}\n$\n\n")
                elif span.get('html', ''):
                    chunks.append(f"\n\n{span['html']}\n\n")
                else:
                    chunks.append(
                        f"\n![]({join_path(img_buket_path, span['image_path'])}) \n")
    # 3rd: table footnotes.
    for block in sub_blocks:
        if block['type'] == BlockType.TableFootnote:
            chunks.append(merge_para_with_text(block))
    return ''.join(chunks)


def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
                                      mode,
                                      img_buket_path=''):
    """Render one page's paragraph blocks as a list of Markdown strings.

    Text and interline-equation blocks are merged as-is, titles are prefixed
    with ``# ``, and image / table blocks are rendered only when *mode* is
    ``'mm'``.  Blocks of any other type, and blocks whose rendered text is
    blank, are skipped.

    Args:
        paras_of_layout: Iterable of paragraph block dicts (each with a
            ``'type'`` key).
        mode: ``'mm'`` to include images and tables; with ``'nlp'`` (or any
            other value) they are omitted.
        img_buket_path: Prefix used when building image links.

    Returns:
        A list with one entry per rendered block: the stripped block text
        followed by a single trailing space.
    """
    page_markdown = []
    for para_block in paras_of_layout:
        para_type = para_block['type']
        if para_type == BlockType.Text or para_type == BlockType.InterlineEquation:
            para_text = merge_para_with_text(para_block)
        elif para_type == BlockType.Title:
            para_text = f'# {merge_para_with_text(para_block)}'
        elif para_type == BlockType.Image and mode == 'mm':
            para_text = _image_block_markdown(para_block, img_buket_path)
        elif para_type == BlockType.Table and mode == 'mm':
            para_text = _table_block_markdown(para_block, img_buket_path)
        else:
            # Unhandled block type, or an image/table outside 'mm' mode.
            para_text = ''

        stripped = para_text.strip()
        if stripped:
            page_markdown.append(stripped + ' ')

    return page_markdown
Binary file added
BIN
+916 KB
projects/web_demo/web_demo/common/mk_markdown/resources/fasttext-langdetect/lid.176.ftz
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters