Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add option to change CJK fonts to SourceHanSerif #337

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ EXPOSE 7860

ENV PYTHONUNBUFFERED=1
ADD "https://github.com/satbyy/go-noto-universal/releases/download/v7.0/GoNotoKurrent-Regular.ttf" /app
ADD "https://github.com/timelic/source-han-serif/releases/download/main/SourceHanSerif-Medium.ttc" /app
RUN apt-get update && \
apt-get install --no-install-recommends -y libgl1 && \
rm -rf /var/lib/apt/lists/* && uv pip install --system --no-cache huggingface-hub && \
Expand All @@ -16,4 +17,4 @@ COPY . .

RUN uv pip install --system --no-cache .

CMD ["pdf2zh", "-i"]
CMD ["pdf2zh", "-i"]
14 changes: 12 additions & 2 deletions pdf2zh/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,10 @@
log = logging.getLogger(__name__)


shs_name = "shs"
noto_name = "noto"


class PDFConverterEx(PDFConverter):
def __init__(
self,
Expand Down Expand Up @@ -134,6 +138,7 @@ def __init__(
lang_out: str = "",
service: str = "",
resfont: str = "",
shs: Font = None,
noto: Font = None,
envs: Dict = None,
prompt: List = None,
Expand All @@ -144,6 +149,7 @@ def __init__(
self.thread = thread
self.layout = layout
self.resfont = resfont
self.shs = shs
self.noto = noto
self.translator: BaseTranslator = None
param = service.split(":", 1)
Expand Down Expand Up @@ -358,8 +364,10 @@ def worker(s: str): # 多线程翻译
############################################################
# C. 新文档排版
def raw_string(fcur: str, cstk: str): # 编码字符串
if fcur == 'noto':
if fcur == noto_name:
return "".join(["%04x" % self.noto.has_glyph(ord(c)) for c in cstk])
elif fcur == shs_name:
return "".join(["%04x" % self.shs.has_glyph(ord(c)) for c in cstk])
elif isinstance(self.fontmap[fcur], PDFCIDFont): # 判断编码长度
return "".join(["%04x" % ord(c) for c in cstk])
else:
Expand Down Expand Up @@ -403,8 +411,10 @@ def raw_string(fcur: str, cstk: str): # 编码字符串
pass
if fcur_ is None:
fcur_ = self.resfont # 默认非拉丁字体
if fcur_ == 'noto':
if fcur_ == noto_name: # FIXME: change to CONST
adv = self.noto.char_lengths(ch, size)[0]
elif fcur_ == shs_name: # FIXME: change to CONST
adv = self.shs.char_lengths(ch, size)[0]
else:
adv = self.fontmap[fcur_].char_width(ord(ch)) * size
ptr += 1
Expand Down
45 changes: 37 additions & 8 deletions pdf2zh/high_level.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import io
import os
import sys
from tabnanny import verbose
import tempfile
import urllib.request
from asyncio import CancelledError
Expand All @@ -20,10 +21,14 @@
from pdfminer.pdfparser import PDFParser
from pymupdf import Document, Font

from pdf2zh.converter import TranslateConverter
from pdf2zh.converter import TranslateConverter, shs_name, noto_name
from pdf2zh.doclayout import DocLayoutModel
from pdf2zh.pdfinterp import PDFPageInterpreterEx


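# FIXME: USE_SHS_FONT is hard-coded; expose it as a user-selectable option (e.g. CLI flag or environment variable).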
USE_SHS_FONT = True

model = DocLayoutModel.load_available()

resfont_map = {
Expand Down Expand Up @@ -85,6 +90,7 @@ def translate_patch(
lang_out: str = "",
service: str = "",
resfont: str = "",
shs: Font = None,
noto: Font = None,
callback: object = None,
cancellation_event: asyncio.Event = None,
Expand All @@ -102,6 +108,7 @@ def translate_patch(
lang_out,
service,
resfont,
shs,
noto,
kwarg.get("envs", {}),
kwarg.get("prompt", []),
Expand Down Expand Up @@ -183,11 +190,28 @@ def translate_stream(
):
font_list = [("tiro", None)]
noto = None
shs = None
ttf_path = None
if lang_out.lower() in resfont_map: # CJK
resfont = resfont_map[lang_out.lower()]
font_list.append((resfont, None))
if not USE_SHS_FONT:
resfont = resfont_map[lang_out.lower()]
else:
resfont = shs_name
# docker
ttf_path = os.environ.get("SHS_FONT_PATH", "/app/SourceHanSerif-Medium.ttc")
if not os.path.exists(ttf_path):
ttf_path = os.path.join(
tempfile.gettempdir(), "SourceHanSerif-Medium.ttc"
)
if not os.path.exists(ttf_path):
print("Downloading SourceHanSerif font...")
urllib.request.urlretrieve(
"https://github.com/timelic/source-han-serif/releases/download/main/SourceHanSerif-Medium.ttc",
ttf_path,
)
shs = Font(shs_name, ttf_path)
elif lang_out.lower() in noto_list: # noto
resfont = "noto"
resfont = noto_name
# docker
ttf_path = os.environ.get("NOTO_FONT_PATH", "/app/GoNotoKurrent-Regular.ttf")

Expand All @@ -199,11 +223,10 @@ def translate_stream(
"https://github.com/satbyy/go-noto-universal/releases/download/v7.0/GoNotoKurrent-Regular.ttf",
ttf_path,
)
font_list.append(("noto", ttf_path))
noto = Font("noto", ttf_path)
noto = Font(noto_name, ttf_path)
else: # fallback
resfont = "china-ss"
font_list.append(("china-ss", None))
font_list.append((resfont, ttf_path))

doc_en = Document(stream=stream)
stream = io.BytesIO()
Expand Down Expand Up @@ -233,6 +256,7 @@ def translate_stream(
pass

fp = io.BytesIO()

doc_zh.save(fp)
obj_patch: dict = translate_patch(fp, prompt=kwarg["prompt"], **locals())

Expand All @@ -247,7 +271,12 @@ def translate_stream(
for id in range(page_count):
doc_en.move_page(page_count + id, id * 2 + 1)

return doc_zh.write(deflate=1), doc_en.write(deflate=1)
doc_zh.subset_fonts(fallback=True)
doc_en.subset_fonts(fallback=True)
return (
doc_zh.write(deflate=True, garbage=3, use_objstms=1),
doc_en.write(deflate=True, garbage=3, use_objstms=1),
)


def convert_to_pdfa(input_path, output_path):
Expand Down
Loading