Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

seems deadlock occurs when using multithread #75

Open
iou2much opened this issue Mar 26, 2020 · 6 comments
Open

seems deadlock occurs when using multithread #75

iou2much opened this issue Mar 26, 2020 · 6 comments
Labels
bug Something isn't working

Comments

@iou2much
Copy link

Here's the mtl_transformer.json for the finetuning stage. I set num_data_threads to 32, and training hangs at the end of the first training epoch.
horovodrun -np 4 -H localhost:4 python athena/horovod_main.py examples/asr/hkust/configs/mtl_transformer.json

{
"batch_size":24,
"num_epochs":50,
"sorta_epoch":1,
"ckpt":"examples/asr/hkust/ckpts/mtl_transformer_ctc/",
"summary_dir":"examples/asr/hkust/ckpts/mtl_transformer_ctc/event",

"solver_gpu":[0],
"solver_config":{
"clip_norm":100,
"log_interval":10,
"enable_tf_function":true
},

"model":"mtl_transformer_ctc",
"num_classes": null,
"pretrained_model": "examples/asr/hkust/configs/mpc.json",
"model_config":{
"model":"speech_transformer",
"model_config":{
"return_encoder_output":true,
"num_filters":512,
"d_model":512,
"num_heads":8,
"num_encoder_layers":12,
"num_decoder_layers":6,
"dff":1280,
"rate":0.1,
"label_smoothing_rate":0.0,
"schedual_sampling_rate":0.9
},
"mtl_weight":0.5
},

"decode_config":{
"beam_search":true,
"beam_size":10,
"ctc_weight":0.5,
"lm_type":"ngram",
"lm_weight":0.3,
"lm_path":"examples/asr/hkust/data/5gram.arpa"
},

"optimizer":"warmup_adam",
"optimizer_config":{
"d_model":512,
"warmup_steps":25000,
"k":1.0
},

"dataset_builder": "speech_recognition_dataset",
"num_data_threads": 12,
"trainset_config":{
"data_csv": "examples/asr/hkust/data/train.csv",
"audio_config":{"type":"Fbank", "filterbank_channel_count":40},
"cmvn_file":"examples/asr/hkust/data/cmvn",
"text_config": {"type":"vocab", "model":"examples/asr/hkust/data/vocab"},
"input_length_range":[10, 8000]
},
"devset_config":{
"data_csv": "examples/asr/hkust/data/dev.mini.csv",
"audio_config":{"type":"Fbank", "filterbank_channel_count":40},
"cmvn_file":"examples/asr/hkust/data/cmvn",
"text_config": {"type":"vocab", "model":"examples/asr/hkust/data/vocab"},
"input_length_range":[10, 8000]
},
"testset_config":{
"data_csv": "examples/asr/hkust/data/dev.mini.csv",
"audio_config":{"type":"Fbank", "filterbank_channel_count":40},
"cmvn_file":"examples/asr/hkust/data/cmvn",
"text_config": {"type":"vocab", "model":"examples/asr/hkust/data/vocab"}
}
}

Here's part of the strace log of one of the processes:

restart_syscall(<... resuming interrupted futex ...>) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745ac, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=448417000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745a8, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=453560000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745ac, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=458726000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745a8, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=463892000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745ac, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=469063000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745a8, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=474251000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745ac, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=479418000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745a8, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=484583000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745ac, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=489748000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745a8, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=494910000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa745ac, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745a8, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=500123000}, 0xffffffff) = 0
futex(0xa745a8, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=500123000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745ac, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=505351000}, 0xffffffff) = 0
futex(0xa745ac, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=505351000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745a8, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=510428000}, 0xffffffff) = 0
futex(0xa745a8, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=510428000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745ac, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=515504000}, 0xffffffff) = 0
futex(0xa745ac, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=515504000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745a8, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=520612000}, 0xffffffff) = 0
futex(0xa745a8, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=520612000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745ac, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=525693000}, 0xffffffff) = 0
futex(0xa745ac, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=525693000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745a8, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=530785000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 1
futex(0xa745ac, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=535930000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745a8, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=541074000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745ac, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=546212000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745a8, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=551297000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745ac, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=556410000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745a8, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=561516000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745ac, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=566631000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745a8, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=571733000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745ac, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=576836000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745a8, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=581946000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745ac, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=587075000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745a8, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=592220000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745ac, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=597350000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745a8, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=602524000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745ac, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=607693000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745a8, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=612850000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745ac, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=617997000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745a8, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=623167000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745a8, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=628321000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745ac, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=633475000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745a8, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=638630000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745ac, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=643783000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745ac, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=648937000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745a8, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=654094000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745a8, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=659251000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745ac, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=664403000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745a8, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=669553000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745ac, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=674704000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745a8, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=679851000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745ac, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=684998000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745ac, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=690118000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745a8, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=695268000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745ac, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=700402000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745ac, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=705515000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745a8, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=710628000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745ac, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=715747000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745a8, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=720839000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745ac, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=725960000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745a8, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=731070000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745ac, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=736199000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745a8, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=741361000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745ac, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=746488000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745a8, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=751589000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745ac, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=756712000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745a8, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=761827000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745ac, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=766948000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0
futex(0xa745a8, FUTEX_WAIT_BITSET_PRIVATE|FUTEX_CLOCK_REALTIME, 0, {tv_sec=1585236534, tv_nsec=772084000}, 0xffffffff) = -1 ETIMEDOUT (Connection timed out)
futex(0xa74540, FUTEX_WAKE_PRIVATE, 1) = 0

@iou2much
Copy link
Author

I changed num_data_threads to 2, and it's working now... but I'm wondering why?

@tjadamlee
Copy link
Contributor

Thanks, this should be a bug. @8z8z8z please help check this

@iou2much
Copy link
Author

iou2much commented Mar 27, 2020

It might be related to this Horovod issue. The smaller I set num_data_threads, the less time I wait for the evaluation stage to start.
Also, my dev set has only 1000 records; I just use it to speed up the whole training process. Does that matter?
I think that might be the cause: a small dev set combined with a large num_data_threads might produce imbalanced batches across threads.

Here's part of the evaluation log:

[1,1]:INFO:absl:loading data using 2 threads
[1,0]:INFO:absl:>>>>> start evaluate in epoch 1
[1,0]:INFO:absl:hparams: [('audio_config', {'type': 'Fbank', 'filterbank_channel_count': 40}), ('cls', <class 'athena.data.datasets.speech_recognition.SpeechRecognitionDatasetBuilder'>), ('cmvn_file', 'examples/asr/seewo/data/cmvn'), ('data_csv', 'examples/asr/seewo/data/dev.mini.csv'), ('input_length_range', [10, 8000]), ('output_length_range', [1, 10000]), ('remove_unk', True), ('speed_permutation', [1.0]), ('text_config', {'type': 'vocab', 'model': 'examples/asr/seewo/data/vocab'})]
[1,0]:Fbank params: [('channel', 1), ('cls', <class 'athena.transform.feats.fbank.Fbank'>), ('delta_delta', False), ('dither', 0.0), ('filterbank_channel_count', 40), ('frame_length', 0.01), ('global_mean', [0.0]), ('global_variance', [1.000001]), ('is_fbank', True), ('local_cmvn', False), ('lower_frequency_limit', 60), ('order', 2), ('output_type', 1), ('preEph_coeff', 0.97), ('raw_energy', 1), ('remove_dc_offset', True), ('snip_edges', 1), ('type', 'Fbank'), ('upper_frequency_limit', 0), ('window', 2), ('window_length', 0.025), ('window_type', 'povey')]
[1,0]:INFO:absl:Successfully load cmvn file examples/asr/seewo/data/cmvn
[1,0]:INFO:absl:Loading data from examples/asr/seewo/data/dev.mini.csv
[1,0]:INFO:absl:loading data using 2 threads
[1,3]:INFO:absl:please be patient, enable tf.function, it takes time ...
[1,1]:INFO:absl:please be patient, enable tf.function, it takes time ...
[1,0]:INFO:absl:please be patient, enable tf.function, it takes time ...
[1,2]:WARNING:absl:the length of logits is shorter than that of labels
[1,0]:WARNING:absl:the length of logits is shorter than that of labels
[1,3]:WARNING:absl:the length of logits is shorter than that of labels
[1,1]:WARNING:absl:the length of logits is shorter than that of labels
[1,2]:INFO:absl:perform batch_wise_shuffle with batch_size 24
[1,2]:INFO:absl:loading data using 2 threads
[1,2]:INFO:absl:please be patient, enable tf.function, it takes time ...
[1,0]:INFO:absl:loss: 8.7214 Accuracy: 0.7576 CTCAccuracy: 0.6364
[1,0]:INFO:absl:epoch: 1 loss: 16.0623 Accuracy: 0.7133 CTCAccuracy: 0.5413
[1,0]:INFO:absl:saving model in :examples/asr/seewo/ckpts/mtl_transformer_ctc/ckpt
[1,0]:INFO:absl:>>>>> start training in epoch 2
[1,0]:INFO:absl:perform batch_wise_shuffle with batch_size 24
[1,0]:INFO:absl:loading data using 2 threads
[1,0]:INFO:absl:please be patient, enable tf.function, it takes time ...
[1,1]:INFO:absl:perform batch_wise_shuffle with batch_size 24
[1,1]:INFO:absl:loading data using 2 threads
[1,3]:INFO:absl:perform batch_wise_shuffle with batch_size 24
[1,3]:INFO:absl:loading data using 2 threads
[1,1]:INFO:absl:please be patient, enable tf.function, it takes time ...
[1,3]:INFO:absl:please be patient, enable tf.function, it takes time ...
[1,2]:WARNING:absl:the length of logits is shorter than that of labels
[1,0]:WARNING:absl:the length of logits is shorter than that of labels
[1,3]:WARNING:absl:the length of logits is shorter than that of labels
[1,1]:WARNING:absl:the length of logits is shorter than that of labels
[1,0]:[2020-03-26 23:54:49.611962: W horovod/common/stall_inspector.cc:105] One or more tensors were submitted to be reduced, gathered or broadcasted by subset of ranks and are waiting for remainder of ranks for more than 60 seconds. This may indicate that different ranks are trying to submit different tensors or that only subset of ranks is submitting tensors, which will cause deadlock.
[1,0]:Stalled ranks:
[1,0]:0: [DistributedGradientTape_Allreduce/HorovodAllgather_Reshape_730_0, DistributedGradientTape_Allreduce/HorovodAllgather_Reshape_731_0]
[1,0]:1: [DistributedGradientTape_Allreduce/HorovodAllgather_Reshape_730_0, DistributedGradientTape_Allreduce/HorovodAllgather_Reshape_731_0]
[1,0]:3: [DistributedGradientTape_Allreduce/HorovodAllgather_Reshape_730_0, DistributedGradientTape_Allreduce/HorovodAllgather_Reshape_731_0]
[1,0]:[2020-03-26 23:55:49.615167: W horovod/common/stall_inspector.cc:105] One or more tensors were submitted to be reduced, gathered or broadcasted by subset of ranks and are waiting for remainder of ranks for more than 60 seconds. This may indicate that different ranks are trying to submit different tensors or that only subset of ranks is submitting tensors, which will cause deadlock.
[1,0]:Stalled ranks:
[1,0]:0: [DistributedGradientTape_Allreduce/HorovodAllgather_Reshape_730_0, DistributedGradientTape_Allreduce/HorovodAllgather_Reshape_731_0, DistributedGradientTape_Allreduce/HorovodAllreduce_BiasAddGrad_0, DistributedGradientTape_Allreduce/HorovodAllreduce_BiasAddGrad_100_0, DistributedGradientTape_Allreduce/HorovodAllreduce_BiasAddGrad_101_0, DistributedGradientTape_Allreduce/HorovodAllreduce_BiasAddGrad_102_0 ...]

@stale
Copy link

stale bot commented Apr 23, 2020

This issue has been automatically marked as stale because it has not had recent activity. It will be closed if no further activity occurs. Thank you for your contributions.

@stale stale bot added the stale label Apr 23, 2020
@Some-random
Copy link
Collaborator

Keeping this issue open. This is still a bug.

@stale stale bot removed the stale label Apr 23, 2020
@Some-random Some-random added the bug Something isn't working label Apr 23, 2020
@JianweiSun007
Copy link
Collaborator

This is a script for automatically installing Horovod on CentOS; it may help you:
https://github.com/8z8z8z/athena/blob/master/tools/build_horovod_centos_env.sh

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
bug Something isn't working
Projects
None yet
Development

No branches or pull requests

4 participants