Update to sentence splitting (#6)
* Update to sentence splitting

* Fix CI

* Fix cloning
WeberJulian authored Nov 16, 2023
1 parent 9994569 commit 09d28e7
Showing 7 changed files with 34 additions and 94 deletions.
47 changes: 1 addition & 46 deletions .github/workflows/build-and-push-to-ghcr.yml
@@ -4,7 +4,7 @@ on:
branches: [main]
pull_request:
jobs:
build-and-push-to-ghcr-cuda117:
build-and-push-to-ghcr-cuda118:
runs-on: ubuntu-22.04
steps:
-
@@ -49,51 +49,6 @@ jobs:
tags: ghcr.io/coqui-ai/xtts-streaming-server:latest, ghcr.io/coqui-ai/xtts-streaming-server:main-${{ github.sha }}
#build-args:

build-and-push-to-ghcr-cuda118:
runs-on: ubuntu-22.04
steps:
-
name: Checkout
uses: actions/checkout@v3

-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3

- name: 'Login to GitHub Container Registry'
run: |
set -xe
docker login --username ${{ github.actor }} --password ${{ secrets.GITHUB_TOKEN }} ghcr.io
- name: 'Remove cache'
run: |
sudo rm -rf /usr/share/dotnet
sudo rm -rf /opt/ghc
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
- name: Build only for PR cuda 11.8
if: github.ref != 'refs/heads/main'
uses: docker/build-push-action@v5
with:
context: "{{defaultContext}}:server"
file: Dockerfile.cuda118
push: false # Do not push image for PR
cache-from: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-latest-cuda118; type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-pr-cuda118-${{ github.event.number }}
cache-to: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-pr-cuda118-${{ github.event.number }}

- name: Build and Push image cuda 11.8
if: github.ref == 'refs/heads/main'
uses: docker/build-push-action@v5
with:
context: "{{defaultContext}}:server"
file: Dockerfile.cuda118
push: true # Push if merged
cache-from: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-latest-cuda118
cache-to: type=registry,ref=ghcr.io/coqui-ai/xtts-streaming-server:cache-latest-cuda118
tags: ghcr.io/coqui-ai/xtts-streaming-server:latest-cuda118, ghcr.io/coqui-ai/xtts-streaming-server:main-cuda118-${{ github.sha }}
#build-args:

build-and-push-to-ghcr-cuda121:
runs-on: ubuntu-22.04
steps:
6 changes: 3 additions & 3 deletions README.md
@@ -32,16 +32,16 @@ $ python test_streaming.py

## Building the container

1. To build the Docker container (Pytorch 2.01 Cuda 11.7) :
1. To build the Docker container (PyTorch 2.1 and CUDA 11.8):

```bash
$ cd server
$ docker build -t xtts-stream .
```
For Pytorch 2.1 and CUDA 11.8 version (when running set NVIDIA_DISABLE_REQUIRE=1 if you have Cuda < 11.8 drivers)
For PyTorch 2.1 and CUDA 12.1:
```bash
$ cd server
# docker build -t xtts-stream . -f Dockerfile.cuda118
docker build -t xtts-stream . -f Dockerfile.cuda121
```
2. Run the server container:

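For context, once the image is built, running it looks roughly like the sketch below (an illustration, not part of this commit): the container exposes port 80 (`EXPOSE 80` in server/Dockerfile), mapping it to host port 8000 matches the test client's default `--server_url`, and `NVIDIA_DISABLE_REQUIRE=1` is now baked into the image by this commit, so no extra flag is needed for drivers older than CUDA 11.8.

```bash
# Rough sketch: run the image built above (assumes the NVIDIA Container Toolkit is installed).
# Port 80 inside the container is mapped to 8000 on the host, matching the test client's default.
docker run --gpus all -p 8000:80 xtts-stream
```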
3 changes: 2 additions & 1 deletion server/Dockerfile
@@ -1,4 +1,4 @@
FROM pytorch/pytorch:2.0.1-cuda11.7-cudnn8-devel
FROM pytorch/pytorch:2.1.0-cuda11.8-cudnn8-devel
ARG DEBIAN_FRONTEND=noninteractive

RUN apt-get update && \
@@ -13,6 +13,7 @@ RUN python -m pip install --use-deprecated=legacy-resolver -r requirements.txt \
RUN python -m unidic download

COPY main.py .
ENV NVIDIA_DISABLE_REQUIRE=1

ENV NUM_THREADS=2
EXPOSE 80
22 changes: 0 additions & 22 deletions server/Dockerfile.cuda118

This file was deleted.

21 changes: 13 additions & 8 deletions server/main.py
@@ -54,7 +54,7 @@ def predict_speaker(wav_file: UploadFile):
temp_audio_name = next(tempfile._get_candidate_names())
with open(temp_audio_name, "wb") as temp, torch.inference_mode():
temp.write(io.BytesIO(wav_file.file.read()).getbuffer())
gpt_cond_latent, _, speaker_embedding = model.get_conditioning_latents(
gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
temp_audio_name
)
return {
@@ -110,12 +110,13 @@ class StreamingInputs(BaseModel):
"nl",
"cs",
"ar",
"zh-cn",
"zh",
"ja",
"hu",
"ko",
]
add_wav_header: bool = True
stream_chunk_size: str = "20"
decoder: str = "ne_hifigan"


def predict_streaming_generator(parsed_input: dict = Body(...)):
@@ -127,16 +128,20 @@ def predict_streaming_generator(parsed_input: dict = Body(...)):
)
text = parsed_input.text
language = parsed_input.language
decoder = parsed_input.decoder

if decoder not in ["ne_hifigan","hifigan"]:
decoder = "ne_hifigan"

stream_chunk_size = int(parsed_input.stream_chunk_size)
add_wav_header = parsed_input.add_wav_header


chunks = model.inference_stream(text, language, gpt_cond_latent, speaker_embedding, decoder=decoder,stream_chunk_size=stream_chunk_size)
chunks = model.inference_stream(
text,
language,
gpt_cond_latent,
speaker_embedding,
stream_chunk_size=stream_chunk_size,
enable_text_splitting=True
)

for i, chunk in enumerate(chunks):
chunk = postprocess(chunk)
if i == 0 and add_wav_header:
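With the `decoder` field removed and sentence splitting handled server-side through `enable_text_splitting`, a request to `/tts_stream` only needs the speaker data plus text, language, and the chunking options. The following is a hedged sketch of calling the endpoint directly; it assumes `jq` and `curl` are available, the server listens on `http://localhost:8000`, and `test/default_speaker.json` contains the `speaker_embedding` and `gpt_cond_latent` fields the endpoint expects.

```bash
# Merge text/language/chunking options into the reference speaker JSON and stream the
# result to a file. Multi-sentence input is now split on the server (enable_text_splitting).
jq '. + {text: "Hello there. This is a second sentence.", language: "en", add_wav_header: true, stream_chunk_size: "20"}' test/default_speaker.json \
  | curl -s -N -X POST http://localhost:8000/tts_stream \
      -H "Content-Type: application/json" \
      --data-binary @- \
      --output streamed_output.wav
```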
2 changes: 1 addition & 1 deletion server/requirements.txt
@@ -1,4 +1,4 @@
TTS==0.20.2
TTS @ git+https://github.com/coqui-ai/TTS@sentence_spliting
uvicorn[standard]==0.23.2
fastapi==0.95.2
deepspeed==0.10.3
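For a local, non-Docker setup, the same pin can be installed by hand; this is just the direct reference from requirements.txt expressed as a pip command.

```bash
# Install the TTS branch this commit pins to (same reference as requirements.txt).
pip install "TTS @ git+https://github.com/coqui-ai/TTS@sentence_spliting"
```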
27 changes: 14 additions & 13 deletions test/test_streaming.py
@@ -38,11 +38,10 @@ def stream_ffplay(audio_stream, output_file, save=True):
ffplay_proc.wait()


def tts(text, speaker,language, server_url , decoder, stream_chunk_size) -> Iterator[bytes]:
def tts(text, speaker, language, server_url, stream_chunk_size) -> Iterator[bytes]:
start = time.perf_counter()
speaker["text"] = text
speaker["language"] = language
speaker["decoder"] = decoder # "hifigan" or "ne_hifigan" for TTS>0.19.0
speaker["stream_chunk_size"] = stream_chunk_size # you can reduce it to get faster response, but degrade quality
res = requests.post(
f"{server_url}/tts_stream",
@@ -86,7 +85,6 @@ def get_speaker(ref_audio,server_url):
default="en",
help="Language to use default is 'en' (English)"
)

parser.add_argument(
"--output_file",
default=None,
@@ -102,25 +100,28 @@ def get_speaker(ref_audio,server_url):
default="http://localhost:8000",
help="Server url http://localhost:8000 default, change to your server location "
)
parser.add_argument(
"--decoder",
default="ne_hifigan",
help="Decoder for vocoder, ne_hifigan default, options ne_hifigan or hifigan"
)

parser.add_argument(
"--stream_chunk_size",
default="20",
help="Stream chunk size , 20 default, reducing will get faster latency but may degrade quality"
)

args = parser.parse_args()

with open("./default_speaker.json", "r") as file:
speaker = json.load(file)

if args.ref_file is not None:
print("Computing the latents for a new reference...")
speaker = get_speaker(args.ref_file,args.server_url)

audio = stream_ffplay(tts(args.text, speaker,args.language,args.server_url,args.decoder,args.stream_chunk_size), args.output_file, save=bool(args.output_file))
speaker = get_speaker(args.ref_file, args.server_url)

audio = stream_ffplay(
tts(
args.text,
speaker,
args.language,
args.server_url,
args.stream_chunk_size
),
args.output_file,
save=bool(args.output_file)
)
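With `--decoder` removed, invoking the test client looks roughly like the sketch below. The `--language`, `--server_url`, `--stream_chunk_size`, and `--output_file` flags appear in the parser above; `--text` and `--ref_file` are assumed to be defined unchanged in the truncated part of the argument parser.

```bash
# Rough usage sketch for the updated client; --output_file saves the stream,
# otherwise it is played back via ffplay (per stream_ffplay's save flag).
python test_streaming.py \
  --text "This is a test. The server now splits it into sentences." \
  --language en \
  --server_url http://localhost:8000 \
  --stream_chunk_size 20 \
  --output_file out.wav
```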
