Skip to content

Commit

Permalink
Apply #803
Browse files Browse the repository at this point in the history
  • Loading branch information
davidmezzetti committed Oct 25, 2024
1 parent 72c68c9 commit 5b27c8a
Show file tree
Hide file tree
Showing 4 changed files with 28 additions and 6 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ jobs:
- name: Build
run: |
pip install -U wheel
pip install .[all,dev] fasttext==0.9.2
pip install .[all,dev] fasttext==0.9.2 pillow==10.4.0
python -c "import nltk; nltk.download(['punkt', 'punkt_tab', 'averaged_perceptron_tagger_eng'])"
python --version
make data coverage
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@

setup(
name="txtai",
version="7.5.0",
version="7.5.1",
author="NeuML",
description="All-in-one open-source embeddings database for semantic search, LLM orchestration and language model workflows",
long_description=DESCRIPTION,
Expand Down
28 changes: 25 additions & 3 deletions src/python/txtai/pipeline/text/translation.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
Translation module
"""

import os

# Conditional import
try:
import fasttext
Expand All @@ -10,7 +12,7 @@
except ImportError:
FASTTEXT = False

from huggingface_hub import cached_download
from huggingface_hub import hf_hub_download
from huggingface_hub.hf_api import HfApi
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

Expand All @@ -24,7 +26,7 @@ class Translation(HFModel):
"""

# Default language detection model
DEFAULT_LANG_DETECT = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz"
DEFAULT_LANG_DETECT = "julien-c/fasttext-language-id/lid.176.ftz"

def __init__(self, path=None, quantize=False, gpu=True, batch=64, langdetect=None, findmodels=True):
"""
Expand Down Expand Up @@ -155,14 +157,34 @@ def defaultdetect(self, texts):
path = self.langdetect if self.langdetect else Translation.DEFAULT_LANG_DETECT

# Load language detection model
path = cached_download(path, legacy_cache_layout=True)
path = path if os.path.exists(path) else self.download(path)
self.detector = fasttext.load_model(path)

# Transform texts to format expected by language detection model
texts = [x.lower().replace("\n", " ").replace("\r\n", " ") for x in texts]

return [x[0].split("__")[-1] for x in self.detector.predict(texts)[0]]

def download(self, path):
    """
    Retrieves a file from the Hugging Face Hub, using the local cache when available.

    The path is split on "/". When it has more than two segments, the first two
    segments form the repo id and the rest form the filename. Otherwise, the first
    segment is the repo id and the remainder is the filename.

    Args:
        path: full model path

    Returns:
        local cached model path
    """

    segments = path.split("/")

    # Number of leading segments that make up the repo id
    if len(segments) > 2:
        boundary = 2
    else:
        boundary = 1

    repo_id = "/".join(segments[:boundary])
    filename = "/".join(segments[boundary:])

    # Download and cache file
    return hf_hub_download(repo_id=repo_id, filename=filename)

def translate(self, texts, source, target, showmodels=False):
"""
Translates text from source to target language.
Expand Down
2 changes: 1 addition & 1 deletion src/python/txtai/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@
"""

# Current version tag
__version__ = "7.5.0"
__version__ = "7.5.1"

0 comments on commit 5b27c8a

Please sign in to comment.