forked from srvk/srvk-eesen-offline-transcriber
-
Notifications
You must be signed in to change notification settings - Fork 0
/
align.sh
executable file
·104 lines (82 loc) · 3.58 KB
/
align.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#!/bin/bash
# Copyright 2016 er1k
# Apache 2.0
# Prepare data for, and run, the align_ctc_multi_utts.sh script that generates word-level alignments
# in an "Eesen Transcriber-centric" way. Output is found in build/output/<basename>.ali
# Required inputs:
#
# * a 'hypothesis' text file for which to compute alignments, extension .txt
# one utterance per line. If no hypothesis text is found, text
# is obtained from the STM file below
# * an STM file with utterance/segment timings - 'perfect' transcription
# * an audio file, extension can vary (.mp3, .wav, .mp4 etc)
# Absolute directory of this script. path.sh, the Makefile, the local/ helpers
# and all build/ outputs are addressed relative to it, so the script cd's there
# once (below) and then uses plain relative paths everywhere. Make targets in
# particular are kept relative (build/...), not prefixed with an absolute path.
BASEDIR=$(CDPATH= cd -- "$(dirname -- "$0")" && pwd) || exit 1
EESEN_ROOT=~/eesen

# Change these if you're using different models
GRAPH_DIR=$EESEN_ROOT/asr_egs/tedlium/v2-30ms/data/lang_phn_test_test_newlm
MODEL_DIR=$EESEN_ROOT/asr_egs/tedlium/v2-30ms/exp/train_phn_l5_c320_v1s

# Defaults (presumably overridable as --frame_shift / --lm_weight through
# parse_options.sh -- confirm against utils/parse_options.sh).
# frame_shift is not read anywhere else in this script.
frame_shift=0.03 # 30 ms frames
lm_weight=0.8    # same as best setting for 30ms eesen tedlium transcriber

# Remember the caller's directory so a relative audio path given on the
# command line can still be resolved after changing directory.
caller_dir=$PWD
cd "$BASEDIR" || { echo "align.sh: cannot cd to $BASEDIR" >&2; exit 1; }

. path.sh
. "$BASEDIR/utils/parse_options.sh"

# Print a message on stderr and abort.
die() {
  echo "align.sh: $*" >&2
  exit 1
}

# Validate the argument count before touching $1.
if [ $# -ne 1 ]; then
  {
    echo "Usage: align.sh <basename>.{wav,mp3,mp4,sph}"
    echo " in same folder is test text named <basename>.txt"
    echo " and STM file named <basename>.stm (for segments)"
    echo " ./align.sh /vagrant/GaryFlake_2010.wav"
    echo " output is build/output/<basename>.ali"
  } >&2
  exit 1
fi

# Make the audio path absolute (relative paths are relative to where the
# caller invoked the script, not to BASEDIR).
case "$1" in
  /*) input=$1 ;;
  *)  input=$caller_dir/$1 ;;
esac

filename=$(basename -- "$input")
basename="${filename%.*}"
dirname=$(dirname -- "$input")
extension="${filename##*.}" # only referenced by the disabled SPH code below

# An empty basename (e.g. input ".wav") would turn the rm -rf further down
# into a wipe of the whole build/trans directory.
[[ -n "$basename" ]] || die "cannot derive a basename from '$1'"

mkdir -p build/audio/base build/output src-audio "build/diarization/$basename" \
  || die "cannot create working directories under $BASEDIR"

# un-shorten-ify SPH files
#if [ $extension == "sph" ]; then
# sph2pipe $1 > build/audio/base/$basename.unshorten
# sox build/audio/base/$basename.unshorten -c 1 build/audio/base/$basename.wav rate -v 16k
#fi

# Stage the audio in src-audio, then let make produce the working .wav.
# Skip the copy when the input already is the staged file (cp would fail
# with "are the same file").
if [[ ! -e "$input" ]]; then
  echo "align.sh: warning: $input not found; relying on an existing copy in src-audio" >&2
elif [[ ! "$input" -ef "src-audio/$filename" ]]; then
  cp -- "$input" src-audio/ || die "cannot copy $input to src-audio"
fi
make "build/audio/base/$basename.wav" \
  || die "make failed for build/audio/base/$basename.wav"
# 8k
# sox $1 -c 1 -e signed-integer build/audio/base/$basename.wav rate -v 8k

# Obtain the STM: an existing .stm next to the audio wins; otherwise one is
# generated from a .cha file, with "xxx" tokens replaced by <unk>.
stm=build/output/$basename.stm
if [[ -f "$dirname/$basename.stm" ]]; then
  if [[ ! "$dirname/$basename.stm" -ef "$stm" ]]; then
    cp -- "$dirname/$basename.stm" build/output/ || die "cannot copy $dirname/$basename.stm"
  fi
elif [[ -f "$dirname/$basename.cha" ]]; then
  local/cha2stm.sh "$dirname/$basename.cha" | sed 's/xxx/\<unk\>/g' > "$stm"
else
  echo "Needs either a .cha or .stm file to get utterances" >&2
  exit 1
fi
[[ -s "$stm" ]] || die "$stm is missing or empty"

#if [ ! -f $dirname/$basename.txt ]; then
# echo "Needs .txt file with utterance per line as reference text to align"
# exit 1
#fi

# make segments from the STM. Lines mentioning inter_segment_gap or
# ignore_time_segment_in_scoring are dropped. Columns 4 and 5 are scaled by
# 100 and written as start and length (presumably seconds -> 10 ms units for
# the .seg format -- confirm). OFMT="%.0f" makes awk print non-integral
# products rounded to whole numbers.
awk '!/inter_segment_gap/ && !/ignore_time_segment_in_scoring/ {
       OFMT = "%.0f"
       print $1, $2, $4*100, ($5-$4)*100, "M S U", $2
     }' "$stm" > "build/diarization/$basename/show.seg" \
  || die "cannot write build/diarization/$basename/show.seg"

# Generate features (start from a clean per-file directory).
rm -rf -- "build/trans/${basename:?}"
make SEGMENTS=show.seg "build/trans/$basename/fbank" \
  || die "make failed for build/trans/$basename/fbank"

# Expect test text in format with utterance IDs per line.
# The <basename>.txt hypothesis path is currently disabled; the text to align
# is always taken from the STM: the first six columns are blanked, the line
# number is used as the utterance ID, and the resulting run of blanks after
# the ID is squeezed to a single space.
# NOTE(review): unlike show.seg above, this does not skip inter_segment_gap /
# ignore_time_segment_in_scoring lines, so the text may contain more
# utterances than there are segments -- confirm this is what the aligner expects.
uttdata=build/trans/$basename
#if [ -f $dirname/$basename.txt ];
# then
# echo "Aligning text found at $dirname/$basename.txt"
# cat $dirname/$basename.txt | awk '{print NR" "$0}' > $uttdata/text
# else
echo "Aligning text found in build/output/$basename.stm"
awk '{$1="";$2="";$3="";$4="";$5="";$6=""; print NR$0}' "$stm" \
  | sed 's/ \+/ /' > "$uttdata/text"
#fi
cp -- "build/diarization/$basename/show.seg" "$uttdata" \
  || die "cannot copy show.seg into $uttdata"

#local/align_ctc_multi_utts.sh --acoustic_scale 0.8 $GRAPH_DIR $GRAPH_DIR $uttdata $MODEL_DIR $uttdata/align
# <langdir> <data> <uttdata> <mdldir> <dir>
local/align_ctc_multi_utts.sh --acoustic_scale "$lm_weight" \
  "$GRAPH_DIR" "$GRAPH_DIR" "$uttdata" "$MODEL_DIR" "$uttdata/align" \
  || die "alignment failed; see messages above"

# Copy results to someplace useful
cp -- "$uttdata/align/ali" "build/output/$basename.ali"