From 1407f08e42306d390366a5538a78235afa41186c Mon Sep 17 00:00:00 2001 From: WojciechMat Date: Tue, 26 Dec 2023 18:01:04 +0100 Subject: [PATCH 01/19] BART pipeline and config --- configs/BARTdenoise.yaml | 49 +++++++++++ pipelines/BART/main.py | 181 +++++++++++++++++++++++++++++++++++++++ train.py | 3 + 3 files changed, 233 insertions(+) create mode 100644 configs/BARTdenoise.yaml create mode 100644 pipelines/BART/main.py diff --git a/configs/BARTdenoise.yaml b/configs/BARTdenoise.yaml new file mode 100644 index 0000000..3cc911b --- /dev/null +++ b/configs/BARTdenoise.yaml @@ -0,0 +1,49 @@ +train: + num_epochs: 5 + accum_iter: 10 + batch_size: 8 + base_lr: 3e-5 + warmup: 4000 + finetune: True + +model_name: BART +dataset_name: 'roszcz/maestro-v1-sustain' +target: denoise +seed: 26 + +overfit: False + +tokens_per_note: multiple +time_quantization_method: start +masking_probability: 0.15 +mask: notes + +encoder: velocity +time_bins: 100 + +dataset: + sequence_duration: 5 + sequence_step: 2 + + quantization: + start: 50 + duration: 5 + velocity: 3 + +device: "cuda:0" + +log: True +log_frequency: 10 +run_name: midi-T5-${now:%Y-%m-%d-%H-%M} +project: "midi-hf-transformer" + +pre_defined_model: null + +model: + encoder_layers: 6, + encoder_ffn_dim: 2048, + encoder_attention_heads: 8, + decoder_layers: 6, + decoder_ffn_dim: 2048, + decoder_attention_heads: 8, + d_model: 512 diff --git a/pipelines/BART/main.py b/pipelines/BART/main.py new file mode 100644 index 0000000..5ff353e --- /dev/null +++ b/pipelines/BART/main.py @@ -0,0 +1,181 @@ +import torch +from datasets import Dataset +from omegaconf import OmegaConf, DictConfig +from transformers import BartConfig, BartForConditionalGeneration + +from utils import vocab_size +from training_utils import train_model +from data.dataset import MaskedMidiDataset, MyTokenizedMidiDataset +from data.midiencoder import VelocityEncoder, QuantizedMidiEncoder +from data.maskedmidiencoder import MaskedMidiEncoder, MaskedNoteEncoder +from data.multitokencoder import MultiMidiEncoder, MultiStartEncoder, MultiVelocityEncoder + + +def main( + cfg: DictConfig, + train_translation_dataset: Dataset, + val_translation_dataset: Dataset, +): + checkpoint = None + if cfg.target == "denoise": + train_dataset, val_dataset = create_masked_datasets( + cfg=cfg, + train_translation_dataset=train_translation_dataset, + val_translation_dataset=val_translation_dataset, + ) + elif cfg.train.finetune: + checkpoint = torch.load(f"checkpoints/denoise/{cfg.pretrained_checkpoint}", map_location=cfg.device) + pretrain_cfg = OmegaConf.create(checkpoint["cfg"]) + # make current cfg fit pre-train cfg but keep relevant info from current config + pretrain_cfg.device = cfg.device + pretrain_cfg.target = cfg.target + pretrain_cfg.train.finetune = True + pretrain_cfg.train.base_lr = cfg.train.base_lr + pretrain_cfg.run_name = cfg.run_name + cfg = pretrain_cfg + + train_dataset, val_dataset = create_datasets_finetune( + cfg=cfg, + train_translation_dataset=train_translation_dataset, + val_translation_dataset=val_translation_dataset, + ) + else: + train_dataset, val_dataset = create_datasets( + cfg=cfg, + train_translation_dataset=train_translation_dataset, + val_translation_dataset=val_translation_dataset, + ) + if cfg.pre_defined_model is not None: + model_cfg = OmegaConf.load(f"configs/architectures/{cfg.pre_defined_model}.yaml") + cfg.model = model_cfg + + start_token_id: int = train_dataset.encoder.token_to_id[""] + pad_token_id: int = train_dataset.encoder.token_to_id[""] + config = BartConfig( + vocab_size=vocab_size(cfg), + decoder_start_token_id=start_token_id, + pad_token_id=pad_token_id, + eos_token_id=pad_token_id, + use_cache=False, + d_model=cfg.model.d_model, + encoder_layers=cfg.model.encoder_layers, + decoder_layers=cfg.model.decoder_layers, + encoder_ffn_dim=cfg.model.encoder_ffn_dim, + decoder_ffn_dim=cfg.model.decoder_ffn_dim, + encoder_attention_heads=cfg.model.encoder_attention_heads, + decoder_attention_heads=cfg.model.decoder_attention_heads, + ) + + model = BartForConditionalGeneration(config) + if checkpoint is not None: + # Pre-trained model has to be trained with the same vocab_size as our model. + # To do that, pre-trained model has to be trained using a base_encoder, + # initialized the same way as our tokenizer. + model.load_state_dict(checkpoint["model_state_dict"]) + + train_model( + model=model, + train_dataset=train_dataset, + val_dataset=val_dataset, + cfg=cfg, + ) + + print(cfg.run_name) + + +def create_datasets( + cfg: DictConfig, + train_translation_dataset: Dataset, + val_translation_dataset: Dataset, +) -> tuple[MyTokenizedMidiDataset, MyTokenizedMidiDataset]: + if cfg.tokens_per_note == "multiple": + if cfg.target == "velocity": + tokenizer = MultiVelocityEncoder( + quantization_cfg=cfg.dataset.quantization, + time_quantization_method=cfg.time_quantization_method, + ) + else: + tokenizer = MultiStartEncoder( + quantization_cfg=cfg.dataset.quantization, + time_quantization_method=cfg.time_quantization_method, + tgt_bins=cfg.start_bins, + ) + else: + tokenizer = VelocityEncoder( + quantization_cfg=cfg.dataset.quantization, + time_quantization_method=cfg.time_quantization_method, + ) + + train_dataset = MyTokenizedMidiDataset( + dataset=train_translation_dataset, + dataset_cfg=cfg.dataset, + encoder=tokenizer, + ) + val_dataset = MyTokenizedMidiDataset( + dataset=val_translation_dataset, + dataset_cfg=cfg.dataset, + encoder=tokenizer, + ) + return train_dataset, val_dataset + + +def create_masked_datasets( + cfg: DictConfig, + train_translation_dataset: Dataset, + val_translation_dataset: Dataset, +) -> tuple[MyTokenizedMidiDataset, MyTokenizedMidiDataset]: + if cfg.tokens_per_note == "multiple": + base_encoder = MultiMidiEncoder( + quantization_cfg=cfg.dataset.quantization, time_quantization_method=cfg.time_quantization_method + ) + else: + base_encoder = QuantizedMidiEncoder(cfg.dataset.quantization, cfg.time_quantization_method) + if cfg.mask == "notes": + encoder = MaskedNoteEncoder(base_encoder=base_encoder, masking_probability=cfg.masking_probability) + else: + encoder = MaskedMidiEncoder(base_encoder=base_encoder, masking_probability=cfg.masking_probability) + + train_dataset = MaskedMidiDataset( + dataset=train_translation_dataset, + dataset_cfg=cfg.dataset, + base_encoder=base_encoder, + encoder=encoder, + ) + + val_dataset = MaskedMidiDataset( + dataset=val_translation_dataset, + dataset_cfg=cfg.dataset, + base_encoder=base_encoder, + encoder=encoder, + ) + + return train_dataset, val_dataset + + +def create_datasets_finetune( + cfg: DictConfig, + train_translation_dataset: Dataset, + val_translation_dataset: Dataset, +) -> tuple[MyTokenizedMidiDataset, MyTokenizedMidiDataset]: + tokenizer = MultiMidiEncoder( + quantization_cfg=cfg.dataset.quantization, + time_quantization_method=cfg.time_quantization_method, + ) + pretraining_tokenizer = MaskedMidiEncoder( + base_encoder=tokenizer, + ) + # use the same token ids as used during pre-training + tokenizer.vocab = pretraining_tokenizer.vocab + tokenizer.token_to_id = pretraining_tokenizer.token_to_id + train_dataset = MyTokenizedMidiDataset( + dataset=train_translation_dataset, + dataset_cfg=cfg.dataset, + encoder=tokenizer, + ) + val_dataset = MyTokenizedMidiDataset( + dataset=val_translation_dataset, + dataset_cfg=cfg.dataset, + encoder=tokenizer, + ) + + return train_dataset, val_dataset diff --git a/train.py b/train.py index 052c87e..4fa1087 100644 --- a/train.py +++ b/train.py @@ -6,6 +6,7 @@ import wandb from data.dataset import load_cache_dataset from pipelines.T5.main import main as t5_training +from pipelines.BART.main import main as bart_training def initialize_wandb(cfg: DictConfig): @@ -51,6 +52,8 @@ def main(cfg: DictConfig): print("Connection error, trying again...") if cfg.model_name == "T5": t5_training(cfg, train_translation_dataset, val_translation_dataset) + elif cfg.model_name == "BART": + bart_training(cfg, train_translation_dataset, val_translation_dataset) if __name__ == "__main__": From b67026ca77f89e9f0e77fec3a7b2b3000acbe3cd Mon Sep 17 00:00:00 2001 From: WojciechMat Date: Tue, 26 Dec 2023 18:02:49 +0100 Subject: [PATCH 02/19] fix config --- configs/BARTdenoise.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/BARTdenoise.yaml b/configs/BARTdenoise.yaml index 3cc911b..c7ad1ea 100644 --- a/configs/BARTdenoise.yaml +++ b/configs/BARTdenoise.yaml @@ -34,8 +34,8 @@ device: "cuda:0" log: True log_frequency: 10 -run_name: midi-T5-${now:%Y-%m-%d-%H-%M} -project: "midi-hf-transformer" +run_name: midi-bart-${now:%Y-%m-%d-%H-%M} +project: "midi-bart" pre_defined_model: null From b3c959a67213c4f715ed90f5dd5f224ad1fb9e23 Mon Sep 17 00:00:00 2001 From: WojciechMat Date: Tue, 26 Dec 2023 18:04:55 +0100 Subject: [PATCH 03/19] fix config --- configs/BARTdenoise.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/BARTdenoise.yaml b/configs/BARTdenoise.yaml index c7ad1ea..c01c8b6 100644 --- a/configs/BARTdenoise.yaml +++ b/configs/BARTdenoise.yaml @@ -15,7 +15,7 @@ overfit: False tokens_per_note: multiple time_quantization_method: start -masking_probability: 0.15 +masking_probability: 0.3 mask: notes encoder: velocity From cdfe0c96b4c4c5e5df6b2cdf14331c41e14d2391 Mon Sep 17 00:00:00 2001 From: WojciechMat Date: Tue, 26 Dec 2023 18:07:11 +0100 Subject: [PATCH 04/19] dashboard for bart --- dashboard/denoise/main.py | 47 +++++++++++++++++++++++++++------------ 1 file changed, 33 insertions(+), 14 deletions(-) diff --git a/dashboard/denoise/main.py b/dashboard/denoise/main.py index 1db7407..f7f86e6 100644 --- a/dashboard/denoise/main.py +++ b/dashboard/denoise/main.py @@ -11,7 +11,7 @@ from fortepyan import MidiPiece from omegaconf import OmegaConf, DictConfig from streamlit_pianoroll import from_fortepyan -from transformers import T5Config, T5ForConditionalGeneration +from transformers import T5Config, T5ForConditionalGeneration, BartConfig, BartForConditionalGeneration from utils import vocab_size from data.midiencoder import QuantizedMidiEncoder @@ -132,20 +132,39 @@ def model_predictions_review( start_token_id: int = dataset.encoder.token_to_id[""] pad_token_id: int = dataset.encoder.token_to_id[""] - config = T5Config( - vocab_size=vocab_size(train_cfg), - decoder_start_token_id=start_token_id, - pad_token_id=pad_token_id, - eos_token_id=pad_token_id, - use_cache=False, - d_model=train_cfg.model.d_model, - d_kv=train_cfg.model.d_kv, - d_ff=train_cfg.model.d_ff, - num_layers=train_cfg.model.num_layers, - num_heads=train_cfg.model.num_heads, - ) + if train_cfg.model_name == "T5": + config = T5Config( + vocab_size=vocab_size(train_cfg), + decoder_start_token_id=start_token_id, + pad_token_id=pad_token_id, + eos_token_id=pad_token_id, + use_cache=False, + d_model=train_cfg.model.d_model, + d_kv=train_cfg.model.d_kv, + d_ff=train_cfg.model.d_ff, + num_layers=train_cfg.model.num_layers, + num_heads=train_cfg.model.num_heads, + ) + + model = T5ForConditionalGeneration(config) + elif train_cfg.model_name == "BART": + config = BartConfig( + vocab_size=vocab_size(cfg), + decoder_start_token_id=start_token_id, + pad_token_id=pad_token_id, + eos_token_id=pad_token_id, + use_cache=False, + d_model=train_cfg.model.d_model, + encoder_layers=train_cfg.model.encoder_layers, + decoder_layers=train_cfg.model.decoder_layers, + encoder_ffn_dim=train_cfg.model.encoder_ffn_dim, + decoder_ffn_dim=train_cfg.model.decoder_ffn_dim, + encoder_attention_heads=train_cfg.model.encoder_attention_heads, + decoder_attention_heads=train_cfg.model.decoder_attention_heads, + ) + + model = BartForConditionalGeneration(config) - model = T5ForConditionalGeneration(config) model.load_state_dict(checkpoint["model_state_dict"]) model.eval().to(DEVICE) From 1933218a837f3486b41e7f78f01e322eb44c807e Mon Sep 17 00:00:00 2001 From: WojciechMat Date: Tue, 26 Dec 2023 20:01:07 +0100 Subject: [PATCH 05/19] fix config --- configs/BARTdenoise.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/configs/BARTdenoise.yaml b/configs/BARTdenoise.yaml index c01c8b6..165d016 100644 --- a/configs/BARTdenoise.yaml +++ b/configs/BARTdenoise.yaml @@ -40,10 +40,10 @@ project: "midi-bart" pre_defined_model: null model: - encoder_layers: 6, - encoder_ffn_dim: 2048, - encoder_attention_heads: 8, - decoder_layers: 6, - decoder_ffn_dim: 2048, - decoder_attention_heads: 8, + encoder_layers: 6 + encoder_ffn_dim: 2048 + encoder_attention_heads: 8 + decoder_layers: 6 + decoder_ffn_dim: 2048 + decoder_attention_heads: 8 d_model: 512 From c73e56023d4dcb5093361a4762baa5b2b90d5a5f Mon Sep 17 00:00:00 2001 From: WojciechMat Date: Thu, 28 Dec 2023 10:12:56 +0100 Subject: [PATCH 06/19] velocity prediction bart finetuning --- configs/BARTdenoise.yaml | 2 +- configs/BARTvelocity.yaml | 44 ++++++++++++++++++++++++++++++++++ dashboard/denoise/main.py | 4 ++-- dashboard/download_models.py | 13 +++------- dashboard/velocity/main.py | 46 +++++++++++++++++++++++++----------- 5 files changed, 82 insertions(+), 27 deletions(-) create mode 100644 configs/BARTvelocity.yaml diff --git a/configs/BARTdenoise.yaml b/configs/BARTdenoise.yaml index 165d016..fee9100 100644 --- a/configs/BARTdenoise.yaml +++ b/configs/BARTdenoise.yaml @@ -2,7 +2,7 @@ train: num_epochs: 5 accum_iter: 10 batch_size: 8 - base_lr: 3e-5 + base_lr: 5e-7 warmup: 4000 finetune: True diff --git a/configs/BARTvelocity.yaml b/configs/BARTvelocity.yaml new file mode 100644 index 0000000..30507a3 --- /dev/null +++ b/configs/BARTvelocity.yaml @@ -0,0 +1,44 @@ +train: + num_epochs: 5 + accum_iter: 10 + batch_size: 8 + base_lr: 5e-6 + warmup: 4000 + finetune: True + +pretrained_checkpoint: midi-bart-2023-12-26-19-05.pt +model_name: BART +dataset_name: 'roszcz/maestro-v1-sustain' +target: velocity +seed: 26 + +overfit: False + +tokens_per_note: "multiple" +time_quantization_method: start +dataset: + sequence_duration: 5 + sequence_step: 2 + + quantization: + start: 20 + duration: 3 + velocity: 3 + +device: "cuda:0" + +log: True +log_frequency: 10 +run_name: midi-bart-${now:%Y-%m-%d-%H-%M} +project: "midi-bart" + +pre_defined_model: null + +model: + encoder_layers: 6 + encoder_ffn_dim: 2048 + encoder_attention_heads: 8 + decoder_layers: 6 + decoder_ffn_dim: 2048 + decoder_attention_heads: 8 + d_model: 512 diff --git a/dashboard/denoise/main.py b/dashboard/denoise/main.py index f7f86e6..7d3134b 100644 --- a/dashboard/denoise/main.py +++ b/dashboard/denoise/main.py @@ -11,7 +11,7 @@ from fortepyan import MidiPiece from omegaconf import OmegaConf, DictConfig from streamlit_pianoroll import from_fortepyan -from transformers import T5Config, T5ForConditionalGeneration, BartConfig, BartForConditionalGeneration +from transformers import T5Config, BartConfig, T5ForConditionalGeneration, BartForConditionalGeneration from utils import vocab_size from data.midiencoder import QuantizedMidiEncoder @@ -149,7 +149,7 @@ def model_predictions_review( model = T5ForConditionalGeneration(config) elif train_cfg.model_name == "BART": config = BartConfig( - vocab_size=vocab_size(cfg), + vocab_size=vocab_size(train_cfg), decoder_start_token_id=start_token_id, pad_token_id=pad_token_id, eos_token_id=pad_token_id, diff --git a/dashboard/download_models.py b/dashboard/download_models.py index 25e39ea..9472574 100644 --- a/dashboard/download_models.py +++ b/dashboard/download_models.py @@ -1,17 +1,10 @@ from huggingface_hub import hf_hub_download -FILENAME_VELOCITY = "velocity-T5-2023-11-11-10-29.pt" -FILENAME_DENOISE = "midi-T5-2023-11-11-10-29.pt" +# FILENAME_VELOCITY = "velocity-T5-2023-11-11-10-29.pt" +FILENAME_DENOISE = "midi-bart-2023-12-26-19-05.pt" - -hf_hub_download( - repo_id="wmatejuk/midi-T5-velocity", - filename=FILENAME_VELOCITY, - local_dir="checkpoints/velocity", - local_dir_use_symlinks=False, -) hf_hub_download( - repo_id="wmatejuk/midi-T5-denoise", + repo_id="wmatejuk/midi-bart-denoise", filename=FILENAME_DENOISE, local_dir="checkpoints/denoise", local_dir_use_symlinks=False, diff --git a/dashboard/velocity/main.py b/dashboard/velocity/main.py index 6081382..41873d8 100644 --- a/dashboard/velocity/main.py +++ b/dashboard/velocity/main.py @@ -10,7 +10,7 @@ from fortepyan import MidiPiece from omegaconf import OmegaConf, DictConfig from streamlit_pianoroll import from_fortepyan -from transformers import T5Config, T5ForConditionalGeneration +from transformers import T5Config, BartConfig, T5ForConditionalGeneration, BartForConditionalGeneration from utils import vocab_size from data.midiencoder import VelocityEncoder @@ -115,20 +115,38 @@ def model_predictions_review( ) start_token_id: int = dataset.encoder.token_to_id[""] pad_token_id: int = dataset.encoder.token_to_id[""] - config = T5Config( - vocab_size=vocab_size(train_cfg), - decoder_start_token_id=start_token_id, - pad_token_id=pad_token_id, - eos_token_id=pad_token_id, - use_cache=False, - d_model=train_cfg.model.d_model, - d_kv=train_cfg.model.d_kv, - d_ff=train_cfg.model.d_ff, - num_layers=train_cfg.model.num_layers, - num_heads=train_cfg.model.num_heads, - ) + if train_cfg.model_name == "T5": + config = T5Config( + vocab_size=vocab_size(train_cfg), + decoder_start_token_id=start_token_id, + pad_token_id=pad_token_id, + eos_token_id=pad_token_id, + use_cache=False, + d_model=train_cfg.model.d_model, + d_kv=train_cfg.model.d_kv, + d_ff=train_cfg.model.d_ff, + num_layers=train_cfg.model.num_layers, + num_heads=train_cfg.model.num_heads, + ) - model = T5ForConditionalGeneration(config) + model = T5ForConditionalGeneration(config) + elif train_cfg.model_name == "BART": + config = BartConfig( + vocab_size=vocab_size(train_cfg), + decoder_start_token_id=start_token_id, + pad_token_id=pad_token_id, + eos_token_id=pad_token_id, + use_cache=False, + d_model=train_cfg.model.d_model, + encoder_layers=train_cfg.model.encoder_layers, + decoder_layers=train_cfg.model.decoder_layers, + encoder_ffn_dim=train_cfg.model.encoder_ffn_dim, + decoder_ffn_dim=train_cfg.model.decoder_ffn_dim, + encoder_attention_heads=train_cfg.model.encoder_attention_heads, + decoder_attention_heads=train_cfg.model.decoder_attention_heads, + ) + + model = BartForConditionalGeneration(config) model.load_state_dict(checkpoint["model_state_dict"]) model.eval().to(DEVICE) From a81c6502518647be5a87b21f02d397c929e1e093 Mon Sep 17 00:00:00 2001 From: WojciechMat Date: Thu, 28 Dec 2023 10:28:28 +0100 Subject: [PATCH 07/19] update title --- dashboard/denoise/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dashboard/denoise/main.py b/dashboard/denoise/main.py index 7d3134b..ada8ef2 100644 --- a/dashboard/denoise/main.py +++ b/dashboard/denoise/main.py @@ -21,7 +21,7 @@ from data.maskedmidiencoder import MaskedMidiEncoder, MaskedNoteEncoder # Set the layout of the Streamlit page -st.set_page_config(layout="wide", page_title="T5 Denoise", page_icon=":musical_keyboard") +st.set_page_config(layout="wide", page_title="Denoise MIDI", page_icon=":musical_keyboard") with st.sidebar: devices = ["cpu"] + [f"cuda:{it}" for it in range(torch.cuda.device_count())] From ad2f4aeb2cef1a45cbb2c0e05cef1f26654f574e Mon Sep 17 00:00:00 2001 From: WojciechMat Date: Thu, 28 Dec 2023 13:22:07 +0100 Subject: [PATCH 08/19] pianoroll with predicted notes --- dashboard/denoise/main.py | 11 +++++++---- requirements.txt | 2 +- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/dashboard/denoise/main.py b/dashboard/denoise/main.py index ada8ef2..a8a5567 100644 --- a/dashboard/denoise/main.py +++ b/dashboard/denoise/main.py @@ -147,7 +147,7 @@ def model_predictions_review( ) model = T5ForConditionalGeneration(config) - elif train_cfg.model_name == "BART": + else: config = BartConfig( vocab_size=vocab_size(train_cfg), decoder_start_token_id=start_token_id, @@ -173,7 +173,7 @@ def model_predictions_review( n_samples: int = 5 np.random.seed(random_seed) - idxs: np.ndarray[int] = np.random.randint(len(dataset), size=n_samples) + ids: int | np.ndarray[int] = np.random.randint(len(dataset), size=n_samples) cols = st.columns(2) with cols[0]: @@ -186,7 +186,7 @@ def model_predictions_review( # widget id for streamlit_pianoroll widget key = 0 - for record_id in idxs: + for record_id in ids: # Numpy to int :( record: dict = dataset.get_complete_record(int(record_id)) record_source: dict = json.loads(record["source"]) @@ -206,8 +206,11 @@ def model_predictions_review( df["mask"] = generated_df["mask"] # create quantized piece with predicted notes pred_piece = MidiPiece(df) + unmasked_notes_df = generated_df[generated_df["mask"]] + unmasked_notes_piece = MidiPiece(unmasked_notes_df) except ValueError: + # create an empty piece if the model did not generate the structure correctly generated_df = pd.DataFrame([[23.0, 1.0, 1.0, 1.0, 1.0]], columns=midi_columns) generated_df["mask"] = [False] pred_piece = MidiPiece(generated_df) @@ -236,7 +239,7 @@ def model_predictions_review( # Predicted fig = ff.view.draw_dual_pianoroll(pred_piece) st.pyplot(fig) - from_fortepyan(pred_piece, key=key + 1) + from_fortepyan(pred_piece, secondary_piece=unmasked_notes_piece, key=key + 1) st.markdown("**Predicted tokens:**") st.markdown(generated_tokens) key += 2 diff --git a/requirements.txt b/requirements.txt index 43e767e..ddabe49 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,7 +9,7 @@ pandas~=2.0.0 PyYAML~=5.4.1 streamlit~=1.25.0 -streamlit-pianoroll~=0.1.3 +streamlit-pianoroll~=0.3.2 torch~=2.0.0 tqdm~=4.65.0 transformers~=4.28.1 From 4099083285c03f4317bd20af34bae660b278b5a3 Mon Sep 17 00:00:00 2001 From: WojciechMat Date: Thu, 28 Dec 2023 15:40:34 +0100 Subject: [PATCH 09/19] piece and composer selection --- dashboard/denoise/main.py | 80 +++++++++++++++++++++++++++----------- dashboard/velocity/main.py | 45 ++++++++++++++------- 2 files changed, 89 insertions(+), 36 deletions(-) diff --git a/dashboard/denoise/main.py b/dashboard/denoise/main.py index a8a5567..5bf1784 100644 --- a/dashboard/denoise/main.py +++ b/dashboard/denoise/main.py @@ -7,8 +7,8 @@ import pandas as pd import fortepyan as ff import streamlit as st -from datasets import Dataset from fortepyan import MidiPiece +from datasets import Dataset, load_dataset from omegaconf import OmegaConf, DictConfig from streamlit_pianoroll import from_fortepyan from transformers import T5Config, BartConfig, T5ForConditionalGeneration, BartForConditionalGeneration @@ -17,8 +17,8 @@ from data.midiencoder import QuantizedMidiEncoder from data.multitokencoder import MultiMidiEncoder from data.quantizer import MidiQuantizer, MidiATQuantizer -from data.dataset import MaskedMidiDataset, load_cache_dataset from data.maskedmidiencoder import MaskedMidiEncoder, MaskedNoteEncoder +from data.dataset import MaskedMidiDataset, build_translation_dataset, build_AT_translation_dataset # Set the layout of the Streamlit page st.set_page_config(layout="wide", page_title="Denoise MIDI", page_icon=":musical_keyboard") @@ -74,23 +74,24 @@ def main(): ) -def model_predictions_review( - checkpoint: dict, - train_cfg: DictConfig, -): - # load checkpoint, force dashboard device - dataset_cfg: DictConfig = train_cfg.dataset +def dataset_selection(train_cfg: DictConfig): dataset_name: str = st.text_input(label="dataset", value=train_cfg.dataset_name) split: str = st.text_input(label="split", value="test") - random_seed: int = st.selectbox(label="random seed", options=range(20)) - # load translation dataset and create MyTokenizedMidiDataset - val_translation_dataset: Dataset = load_cache_dataset( - dataset_cfg=dataset_cfg, - dataset_name=dataset_name, - split=split, - ) + val_translation_dataset: Dataset = load_dataset(path=dataset_name, split=split) + return val_translation_dataset + + +def create_dataset(base_dataset: Dataset, train_cfg: DictConfig): + # load checkpoint, force dashboard device + dataset_cfg: DictConfig = train_cfg.dataset + + if "dstart" in dataset_cfg.quantization: + translation_dataset = build_translation_dataset(base_dataset, dataset_cfg) + else: + translation_dataset = build_AT_translation_dataset(base_dataset, dataset_cfg) + if train_cfg.time_quantization_method == "start": quantizer = MidiATQuantizer( n_duration_bins=dataset_cfg.quantization.duration, @@ -124,11 +125,42 @@ def model_predictions_review( encoder = MaskedMidiEncoder(base_encoder=base_tokenizer, masking_probability=train_cfg.masking_probability) dataset = MaskedMidiDataset( - dataset=val_translation_dataset, + dataset=translation_dataset, dataset_cfg=train_cfg.dataset, base_encoder=base_tokenizer, encoder=encoder, ) + return dataset, quantizer + + +def model_predictions_review( + checkpoint: dict, + train_cfg: DictConfig, +): + midi_dataset = dataset_selection(train_cfg=train_cfg) + source_df = midi_dataset.to_pandas() + composers = source_df.composer.unique() + selected_composer = st.selectbox( + label="Select composer", + options=composers, + index=3, + ) + + ids = source_df.composer == selected_composer + piece_titles = source_df[ids].title.unique() + selected_title = st.selectbox( + label="Select title", + options=piece_titles, + ) + st.write(selected_title) + + ids = (source_df.composer == selected_composer) & (source_df.title == selected_title) + part_df = source_df[ids] + part_dataset = midi_dataset.select(part_df.index.values) + + dataset, quantizer = create_dataset(part_dataset, train_cfg=train_cfg) + + random_seed: int = st.selectbox(label="random seed", options=range(20)) start_token_id: int = dataset.encoder.token_to_id[""] pad_token_id: int = dataset.encoder.token_to_id[""] @@ -201,7 +233,7 @@ def model_predictions_review( true_piece = MidiPiece(df=true_notes, source=record_source) true_piece.time_shift(-true_piece.df.start.min()) try: - generated_df: pd.DataFrame = encoder.decode(src_token_ids, generated_token_ids) + generated_df: pd.DataFrame = dataset.encoder.decode(src_token_ids, generated_token_ids) df = quantizer.apply_quantization(generated_df) df["mask"] = generated_df["mask"] # create quantized piece with predicted notes @@ -230,18 +262,20 @@ def model_predictions_review( st.pyplot(fig) from_fortepyan(true_piece, key=key) # Unchanged - st.markdown("**Source tokens:**") - st.markdown(source_tokens) - st.markdown("**Target tokens:**") - st.markdown(tgt_tokens) + with st.expander(label="original tokens", expanded=False): + st.markdown("**Source tokens:**") + st.markdown(source_tokens) + st.markdown("**Target tokens:**") + st.markdown(tgt_tokens) with cols[1]: # Predicted fig = ff.view.draw_dual_pianoroll(pred_piece) st.pyplot(fig) from_fortepyan(pred_piece, secondary_piece=unmasked_notes_piece, key=key + 1) - st.markdown("**Predicted tokens:**") - st.markdown(generated_tokens) + with st.expander(label="predicted tokens", expanded=False): + st.markdown("**Predicted tokens:**") + st.markdown(generated_tokens) key += 2 diff --git a/dashboard/velocity/main.py b/dashboard/velocity/main.py index 41873d8..8cdc959 100644 --- a/dashboard/velocity/main.py +++ b/dashboard/velocity/main.py @@ -8,14 +8,15 @@ import fortepyan as ff import streamlit as st from fortepyan import MidiPiece +from datasets import Dataset, load_dataset from omegaconf import OmegaConf, DictConfig from streamlit_pianoroll import from_fortepyan from transformers import T5Config, BartConfig, T5ForConditionalGeneration, BartForConditionalGeneration from utils import vocab_size from data.midiencoder import VelocityEncoder +from data.dataset import MyTokenizedMidiDataset from data.maskedmidiencoder import MaskedMidiEncoder -from data.dataset import MyTokenizedMidiDataset, load_cache_dataset from data.multitokencoder import MultiMidiEncoder, MultiVelocityEncoder # Set the layout of the Streamlit page @@ -67,23 +68,41 @@ def main(): ) +def dataset_selection(train_cfg: DictConfig): + dataset_name: str = st.text_input(label="dataset", value=train_cfg.dataset_name) + split: str = st.text_input(label="split", value="test") + + # load translation dataset and create MyTokenizedMidiDataset + val_translation_dataset: Dataset = load_dataset(path=dataset_name, split=split) + return val_translation_dataset + + def model_predictions_review( checkpoint: dict, train_cfg: DictConfig, ): - # load checkpoint, force dashboard device - dataset_cfg = train_cfg.dataset - dataset_name = st.text_input(label="dataset", value=train_cfg.dataset_name) - split = st.text_input(label="split", value="test") - - random_seed = st.selectbox(label="random seed", options=range(20)) + midi_dataset = dataset_selection(train_cfg=train_cfg) + source_df = midi_dataset.to_pandas() + composers = source_df.composer.unique() + selected_composer = st.selectbox( + label="Select composer", + options=composers, + index=3, + ) - # load translation dataset and create MyTokenizedMidiDataset - val_translation_dataset = load_cache_dataset( - dataset_cfg=dataset_cfg, - dataset_name=dataset_name, - split=split, + ids = source_df.composer == selected_composer + piece_titles = source_df[ids].title.unique() + selected_title = st.selectbox( + label="Select title", + options=piece_titles, ) + st.write(selected_title) + + ids = (source_df.composer == selected_composer) & (source_df.title == selected_title) + part_df = source_df[ids] + part_dataset = midi_dataset.select(part_df.index.values) + + random_seed = st.selectbox(label="random seed", options=range(20)) if "finetune" in train_cfg.train and train_cfg.train.finetune: tokenizer = MultiMidiEncoder( @@ -109,7 +128,7 @@ def model_predictions_review( ) dataset = MyTokenizedMidiDataset( - dataset=val_translation_dataset, + dataset=part_dataset, dataset_cfg=train_cfg.dataset, encoder=tokenizer, ) From 604b6cc0dcb1990a10c92432a857f0f6fbe3d8e4 Mon Sep 17 00:00:00 2001 From: WojciechMat Date: Wed, 3 Jan 2024 12:10:02 +0100 Subject: [PATCH 10/19] fix dashboard --- dashboard/velocity/main.py | 32 ++++++++++---------------------- 1 file changed, 10 insertions(+), 22 deletions(-) diff --git a/dashboard/velocity/main.py b/dashboard/velocity/main.py index 8cdc959..11f362b 100644 --- a/dashboard/velocity/main.py +++ b/dashboard/velocity/main.py @@ -15,9 +15,9 @@ from utils import vocab_size from data.midiencoder import VelocityEncoder -from data.dataset import MyTokenizedMidiDataset from data.maskedmidiencoder import MaskedMidiEncoder from data.multitokencoder import MultiMidiEncoder, MultiVelocityEncoder +from data.dataset import MyTokenizedMidiDataset, build_translation_dataset, build_AT_translation_dataset # Set the layout of the Streamlit page st.set_page_config(layout="wide", page_title="Velocity Transformer", page_icon=":musical_keyboard") @@ -102,6 +102,11 @@ def model_predictions_review( part_df = source_df[ids] part_dataset = midi_dataset.select(part_df.index.values) + if "dstart" in train_cfg.dataset.quantization: + translation_dataset = build_translation_dataset(part_dataset, train_cfg.dataset) + else: + translation_dataset = build_AT_translation_dataset(part_dataset, train_cfg.dataset) + random_seed = st.selectbox(label="random seed", options=range(20)) if "finetune" in train_cfg.train and train_cfg.train.finetune: @@ -128,10 +133,11 @@ def model_predictions_review( ) dataset = MyTokenizedMidiDataset( - dataset=part_dataset, + dataset=translation_dataset, dataset_cfg=train_cfg.dataset, encoder=tokenizer, ) + start_token_id: int = dataset.encoder.token_to_id[""] pad_token_id: int = dataset.encoder.token_to_id[""] if train_cfg.model_name == "T5": @@ -149,7 +155,7 @@ def model_predictions_review( ) model = T5ForConditionalGeneration(config) - elif train_cfg.model_name == "BART": + else: config = BartConfig( vocab_size=vocab_size(train_cfg), decoder_start_token_id=start_token_id, @@ -172,25 +178,6 @@ def model_predictions_review( n_parameters: float = sum(p.numel() for p in model.parameters()) / 1e6 st.markdown(f"Model parameters: {n_parameters:.3f}M") - start_token_id: int = dataset.encoder.token_to_id[""] - pad_token_id: int = dataset.encoder.token_to_id[""] - config = T5Config( - vocab_size=vocab_size(train_cfg), - decoder_start_token_id=start_token_id, - pad_token_id=pad_token_id, - eos_token_id=pad_token_id, - use_cache=False, - d_model=train_cfg.model.d_model, - d_kv=train_cfg.model.d_kv, - d_ff=train_cfg.model.d_ff, - num_layers=train_cfg.model.num_layers, - num_heads=train_cfg.model.num_heads, - ) - - model = T5ForConditionalGeneration(config) - model.load_state_dict(checkpoint["model_state_dict"]) - model.eval().to(DEVICE) - n_parameters = sum(p.numel() for p in model.parameters()) / 1e6 st.markdown(f"Model parameters: {n_parameters:.3f}M") @@ -230,6 +217,7 @@ def model_predictions_review( pred_piece_df = true_piece.df.copy() # change untokenized velocities to model predictions + pred_piece_df = pred_piece_df.iloc[: len(generated_velocity)] pred_piece_df["velocity"] = generated_velocity pred_piece_df["velocity"] = pred_piece_df["velocity"].fillna(0) From 5b1061b2c987b8fe619a0de7f31464153daac3ca Mon Sep 17 00:00:00 2001 From: WojciechMat Date: Tue, 9 Jan 2024 01:40:40 +0100 Subject: [PATCH 11/19] Masked encoder for bart --- configs/BARTdenoise.yaml | 2 +- data/maskedmidiencoder.py | 16 ++++++++++++++++ pipelines/BART/main.py | 4 ++-- 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/configs/BARTdenoise.yaml b/configs/BARTdenoise.yaml index fee9100..7d3a59b 100644 --- a/configs/BARTdenoise.yaml +++ b/configs/BARTdenoise.yaml @@ -26,7 +26,7 @@ dataset: sequence_step: 2 quantization: - start: 50 + start: 100 duration: 5 velocity: 3 diff --git a/data/maskedmidiencoder.py b/data/maskedmidiencoder.py index b370282..8ed4e66 100644 --- a/data/maskedmidiencoder.py +++ b/data/maskedmidiencoder.py @@ -175,3 +175,19 @@ def decode(self, src_token_ids: torch.Tensor, tgt_token_ids: torch.Tensor): mask_ids_column[mask_ids] = True df["mask"] = mask_ids_column return df + + +class SingleMaskedNoteEncoder(MaskedNoteEncoder): + def __init__(self, base_encoder: MultiTokEncoder | MidiEncoder, masking_probability: float = 0.15): + super().__init__(base_encoder, masking_probability) + + def encode_record(self, record: dict) -> tuple[list[int], list[int]]: + """ + Encode record into src and tgt for unsupervised BART learning. + """ + src_tokens, tgt_tokens = self.mask_record(record) + + src = [self.token_to_id[token] for token in src_tokens] + tgt = [self.token_to_id[token] for token in tgt_tokens] + + return src, tgt diff --git a/pipelines/BART/main.py b/pipelines/BART/main.py index 5ff353e..457ce6c 100644 --- a/pipelines/BART/main.py +++ b/pipelines/BART/main.py @@ -7,7 +7,7 @@ from training_utils import train_model from data.dataset import MaskedMidiDataset, MyTokenizedMidiDataset from data.midiencoder import VelocityEncoder, QuantizedMidiEncoder -from data.maskedmidiencoder import MaskedMidiEncoder, MaskedNoteEncoder +from data.maskedmidiencoder import MaskedMidiEncoder, SingleMaskedNoteEncoder from data.multitokencoder import MultiMidiEncoder, MultiStartEncoder, MultiVelocityEncoder @@ -131,7 +131,7 @@ def create_masked_datasets( else: base_encoder = QuantizedMidiEncoder(cfg.dataset.quantization, cfg.time_quantization_method) if cfg.mask == "notes": - encoder = MaskedNoteEncoder(base_encoder=base_encoder, masking_probability=cfg.masking_probability) + encoder = SingleMaskedNoteEncoder(base_encoder=base_encoder, masking_probability=cfg.masking_probability) else: encoder = MaskedMidiEncoder(base_encoder=base_encoder, masking_probability=cfg.masking_probability) From 68ab61661f65132b3140c9586d2600b29ff2f8f0 Mon Sep 17 00:00:00 2001 From: WojciechMat Date: Tue, 9 Jan 2024 01:52:10 +0100 Subject: [PATCH 12/19] accuracy metric --- training_utils.py | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/training_utils.py b/training_utils.py index 5303516..83c552a 100644 --- a/training_utils.py +++ b/training_utils.py @@ -48,7 +48,7 @@ def collate_fn(batch): print(f"Epoch {epoch}", flush=True) # Train model for one epoch - t_loss, t_dist = train_epoch( + t_loss, t_dist, t_acc = train_epoch( dataloader=train_dataloader, val_dataloader=val_dataloader, model=model, @@ -64,7 +64,7 @@ def collate_fn(batch): print(f"Epoch {epoch} Validation", flush=True) model.eval() # Evaluate the model on validation set - v_loss, v_dist = val_epoch( + v_loss, v_dist, v_acc = val_epoch( dataloader=val_dataloader, model=model, device=cfg.device, @@ -86,8 +86,10 @@ def collate_fn(batch): { "val/loss_epoch": v_loss, "val/dist_epoch": v_dist, + "val/accuracy_epoch": v_acc, "train/loss_epoch": t_loss, "train/dist_epoch": t_dist, + "train/accuracy_epoch": t_acc, "epoch": epoch, } ) @@ -155,10 +157,13 @@ def train_epoch( log_frequency: int = 10, log: bool = False, device: str = "cpu", -) -> tuple[float, float]: +) -> tuple[float, float, float]: start: float = time.time() + total_loss: float = 0 total_dist: float = 0 + total_acc: float = 0 + tokens: int = 0 n_accum: int = 0 it: int = 0 @@ -188,6 +193,7 @@ def train_epoch( loss.backward() dist = calculate_average_distance(out_rearranged, target, pad_idx=pad_idx) + accuracy = (out_rearranged == target).sum() / n_tokens # Update the model parameters and optimizer gradients every `accum_iter` iterations if it % accum_iter == 0 or it == steps - 1: @@ -201,23 +207,25 @@ def train_epoch( total_loss += loss.item() tokens += n_tokens total_dist += dist + total_acc += accuracy # log metrics every log_frequency steps if it % log_frequency == 1: lr = optimizer.param_groups[0]["lr"] elapsed = time.time() - start tok_rate = tokens / elapsed + accuracy = (out_rearranged == target).sum() / n_tokens progress_bar.set_description( - f"Step: {it:6d}/{steps} | acc_step: {n_accum:3d} | loss: {loss_item:6.2f} | dist: {dist:6.2f}" - + f"| tps: {tok_rate:7.1f} | lr: {lr:6.1e}" + f"Step: {it:6d}/{steps} | acc_step: {n_accum:3d} | loss: {loss_item:6.2f} | dist: {dist:6.2f} " + + f"| acc: {accuracy:6.2f} | tps: {tok_rate:7.1f} | lr: {lr:6.1e}" ) # log the loss each to Weights and Biases if log: - wandb.log({"train/loss_step": loss.item(), "train/dist_step": dist}) + wandb.log({"train/loss_step": loss.item(), "train/dist_step": dist, "train/accuracy_step": accuracy}) if it % log_frequency * 200 == 1: - val_loss, val_dist = val_epoch( + val_loss, val_dist, val_acc = val_epoch( dataloader=val_dataloader, model=model, pad_idx=pad_idx, @@ -225,10 +233,10 @@ def train_epoch( device=device, ) if log: - wandb.log({"val/loss_step": val_loss, "val/dist_step": val_dist}) + wandb.log({"val/loss_step": val_loss, "val/dist_step": val_dist, "val/accuracy_step": val_acc}) # Return average loss over all tokens and updated train state - return total_loss / len(dataloader), total_dist / len(dataloader) + return total_loss / len(dataloader), total_dist / len(dataloader), total_acc / len(dataloader) @torch.no_grad() @@ -238,9 +246,10 @@ def val_epoch( pad_idx: int = 1, cls_idx: int = 0, device: str = "cpu", -) -> tuple[float, float]: +) -> tuple[float, float, float]: total_tokens: int = 0 total_loss: float = 0 + total_acc: float = 0 tokens: int = 0 total_dist: float = 0 @@ -267,6 +276,7 @@ def val_epoch( total_tokens += n_tokens tokens += n_tokens total_dist += calculate_average_distance(out_rearranged, target, pad_idx=pad_idx) + total_acc += (out_rearranged == target).sum() / n_tokens # Return average loss over all tokens and updated train state - return total_loss / len(dataloader), total_dist / len(dataloader) + return total_loss / len(dataloader), total_dist / len(dataloader), total_acc / len(dataloader) From 39f6afcfb62afb9dd38441366409d6e32eab795b Mon Sep 17 00:00:00 2001 From: WojciechMat Date: Tue, 9 Jan 2024 02:20:12 +0100 Subject: [PATCH 13/19] accuracy metric --- training_utils.py | 6 +++--- utils.py | 10 ++++++++++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/training_utils.py b/training_utils.py index 83c552a..9552896 100644 --- a/training_utils.py +++ b/training_utils.py @@ -9,8 +9,8 @@ from omegaconf import OmegaConf, DictConfig import wandb -from utils import calculate_average_distance from data.dataset import MyTokenizedMidiDataset +from utils import calculate_accuracy, calculate_average_distance def train_model( @@ -193,7 +193,7 @@ def train_epoch( loss.backward() dist = calculate_average_distance(out_rearranged, target, pad_idx=pad_idx) - accuracy = (out_rearranged == target).sum() / n_tokens + accuracy = calculate_accuracy(out_rearranged, target, pad_idx=pad_idx) # Update the model parameters and optimizer gradients every `accum_iter` iterations if it % accum_iter == 0 or it == steps - 1: @@ -276,7 +276,7 @@ def val_epoch( total_tokens += n_tokens tokens += n_tokens total_dist += calculate_average_distance(out_rearranged, target, pad_idx=pad_idx) - total_acc += (out_rearranged == target).sum() / n_tokens + total_acc += calculate_accuracy(out_rearranged, target, pad_idx=pad_idx) # Return average loss over all tokens and updated train state return total_loss / len(dataloader), total_dist / len(dataloader), total_acc / len(dataloader) diff --git a/utils.py b/utils.py index c7df3f6..5ee0b6f 100644 --- a/utils.py +++ b/utils.py @@ -119,6 +119,16 @@ def calculate_average_distance(out: torch.Tensor, tgt: torch.Tensor, pad_idx: in return torch.dist(labels, tgt.to(float), p=1) / len(labels) +def calculate_accuracy(out: torch.Tensor, tgt: torch.Tensor, pad_idx: int = 1) -> torch.Tensor: + labels = out.argmax(1).to(float) + tgt[tgt == -100] = pad_idx + tgt = tgt[tgt != pad_idx] + labels = labels[: len(tgt)] + labels[tgt == pad_idx] = pad_idx + + return (labels == tgt).sum() / len(labels) + + def learning_rate_schedule(step: int, warmup: int): return 1 / sqrt(max(step, warmup)) From edc8f836eff233e45d2691391a8c8f4969b7ae01 Mon Sep 17 00:00:00 2001 From: WojciechMat Date: Tue, 9 Jan 2024 08:36:46 +0100 Subject: [PATCH 14/19] fix accuracy calculation --- training_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/training_utils.py b/training_utils.py index 9552896..abf258f 100644 --- a/training_utils.py +++ b/training_utils.py @@ -214,7 +214,6 @@ def train_epoch( lr = optimizer.param_groups[0]["lr"] elapsed = time.time() - start tok_rate = tokens / elapsed - accuracy = (out_rearranged == target).sum() / n_tokens progress_bar.set_description( f"Step: {it:6d}/{steps} | acc_step: {n_accum:3d} | loss: {loss_item:6.2f} | dist: {dist:6.2f} " + f"| acc: {accuracy:6.2f} | tps: {tok_rate:7.1f} | lr: {lr:6.1e}" From 2ebc8a28040107b2a02ffe95cf943f051faec803 Mon Sep 17 00:00:00 2001 From: WojciechMat Date: Tue, 9 Jan 2024 23:24:23 +0100 Subject: [PATCH 15/19] fixed SingleMaskedNoteEncoder, new encoder on dashboard --- dashboard/denoise/main.py | 15 +++++++++++++-- data/maskedmidiencoder.py | 27 +++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 2 deletions(-) diff --git a/dashboard/denoise/main.py b/dashboard/denoise/main.py index 5bf1784..79f869d 100644 --- a/dashboard/denoise/main.py +++ b/dashboard/denoise/main.py @@ -17,7 +17,7 @@ from data.midiencoder import QuantizedMidiEncoder from data.multitokencoder import MultiMidiEncoder from data.quantizer import MidiQuantizer, MidiATQuantizer -from data.maskedmidiencoder import MaskedMidiEncoder, MaskedNoteEncoder +from data.maskedmidiencoder import MaskedMidiEncoder, MaskedNoteEncoder, SingleMaskedNoteEncoder from data.dataset import MaskedMidiDataset, build_translation_dataset, build_AT_translation_dataset # Set the layout of the Streamlit page @@ -118,7 +118,16 @@ def create_dataset(base_dataset: Dataset, train_cfg: DictConfig): if "mask" in train_cfg: if train_cfg.mask == "notes": - encoder = MaskedNoteEncoder(base_encoder=base_tokenizer, masking_probability=train_cfg.masking_probability) + if train_cfg.model_name == "T5": + encoder = MaskedNoteEncoder( + base_encoder=base_tokenizer, + masking_probability=train_cfg.masking_probability, + ) + else: + encoder = SingleMaskedNoteEncoder( + base_encoder=base_tokenizer, + masking_probability=train_cfg.masking_probability, + ) else: encoder = MaskedMidiEncoder(base_encoder=base_tokenizer, masking_probability=train_cfg.masking_probability) else: @@ -246,6 +255,8 @@ def model_predictions_review( generated_df = pd.DataFrame([[23.0, 1.0, 1.0, 1.0, 1.0]], columns=midi_columns) generated_df["mask"] = [False] pred_piece = MidiPiece(generated_df) + unmasked_notes_df = generated_df[generated_df["mask"]] + unmasked_notes_piece = MidiPiece(unmasked_notes_df) pred_piece.source = true_piece.source.copy() diff --git a/data/maskedmidiencoder.py b/data/maskedmidiencoder.py index 8ed4e66..5cea5dd 100644 --- a/data/maskedmidiencoder.py +++ b/data/maskedmidiencoder.py @@ -181,6 +181,22 @@ class SingleMaskedNoteEncoder(MaskedNoteEncoder): def __init__(self, base_encoder: MultiTokEncoder | MidiEncoder, masking_probability: float = 0.15): super().__init__(base_encoder, masking_probability) + def mask_record(self, record: dict) -> tuple[np.ndarray, np.ndarray]: + """ + Mask record and return tuple of src and tgt tokens with masks. + """ + src_tokens = self.base_encoder.tokenize_src(record) + tgt_tokens = src_tokens.copy() + num_masks = self.masking_probability * len(src_tokens) + + ids_to_mask = np.random.randint(len(src_tokens), size=int(num_masks)) + np_src = np.array(src_tokens) + np_tgt = np.array(tgt_tokens) + + np_src[ids_to_mask] = "" + + return np_src, np_tgt + def encode_record(self, record: dict) -> tuple[list[int], list[int]]: """ Encode record into src and tgt for unsupervised BART learning. @@ -191,3 +207,14 @@ def encode_record(self, record: dict) -> tuple[list[int], list[int]]: tgt = [self.token_to_id[token] for token in tgt_tokens] return src, tgt + + def decode(self, src_token_ids: torch.Tensor, tgt_token_ids: torch.Tensor) -> pd.DataFrame: + tokens: list[str] = [self.vocab[token_id] for token_id in tgt_token_ids] + mask_ids = src_token_ids == self.token_to_id[""] + + df: pd.DataFrame = self.base_encoder.untokenize_src(tokens) + mask_ids_column = np.zeros_like(df["pitch"], dtype=bool) + mask_ids_column[mask_ids] = True + df["mask"] = mask_ids_column + + return df From 6f63b92760f397d0a04c6f97b88ab2eaa553f276 Mon Sep 17 00:00:00 2001 From: WojciechMat Date: Wed, 10 Jan 2024 23:10:49 +0100 Subject: [PATCH 16/19] fix encoder --- data/maskedmidiencoder.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/data/maskedmidiencoder.py b/data/maskedmidiencoder.py index 5cea5dd..72f9e56 100644 --- a/data/maskedmidiencoder.py +++ b/data/maskedmidiencoder.py @@ -185,13 +185,14 @@ def mask_record(self, record: dict) -> tuple[np.ndarray, np.ndarray]: """ Mask record and return tuple of src and tgt tokens with masks. """ - src_tokens = self.base_encoder.tokenize_src(record) - tgt_tokens = src_tokens.copy() - num_masks = self.masking_probability * len(src_tokens) + src_tokens: list[str] = self.base_encoder.tokenize_src(record) + tgt_tokens: list[str] = src_tokens.copy() + num_masks: float = self.masking_probability * len(src_tokens) // 3 - ids_to_mask = np.random.randint(len(src_tokens), size=int(num_masks)) - np_src = np.array(src_tokens) - np_tgt = np.array(tgt_tokens) + ids_to_mask: np.ndarray[int] = np.random.randint(len(src_tokens) // 3, size=int(num_masks)) * 3 + ids_to_mask = np.concatenate([ids_to_mask, ids_to_mask + 1, ids_to_mask + 2]) + np_src: np.ndarray[int] = np.array(src_tokens) + np_tgt: np.ndarray[int] = np.array(tgt_tokens) np_src[ids_to_mask] = "" From 49a94882597c256fe84b6ca7005efc4e4b538e9d Mon Sep 17 00:00:00 2001 From: WojciechMat Date: Thu, 11 Jan 2024 14:48:49 +0100 Subject: [PATCH 17/19] SingeMaskedMidiEncoder for BART training --- configs/BARTdenoise-dstart.yaml | 48 +++++++++++++++++++++++++++++++++ data/maskedmidiencoder.py | 43 +++++++++++++++++++++++++++++ pipelines/BART/main.py | 6 ++--- 3 files changed, 94 insertions(+), 3 deletions(-) create mode 100644 configs/BARTdenoise-dstart.yaml diff --git a/configs/BARTdenoise-dstart.yaml b/configs/BARTdenoise-dstart.yaml new file mode 100644 index 0000000..6da05f0 --- /dev/null +++ b/configs/BARTdenoise-dstart.yaml @@ -0,0 +1,48 @@ +train: + num_epochs: 5 + accum_iter: 5 + batch_size: 2 + base_lr: 3e-5 + warmup: 4000 + finetune: False + +model_name: T5 +dataset_name: 'roszcz/maestro-v1-sustain' +target: denoise +seed: 26 + +overfit: False + +tokens_per_note: single +time_quantization_method: dstart +masking_probability: 0.3 +mask: tokens + +encoder: velocity +time_bins: 100 + +dataset: + sequence_len: 128 + sequence_step: 42 + + quantization: + dstart: 8 + duration: 8 + velocity: 3 + +device: "cuda:0" + +log: True +log_frequency: 10 +run_name: midi-bart-${now:%Y-%m-%d-%H-%M} +project: "midi-bart" + +pre_defined_model: null +model: + encoder_layers: 6 + encoder_ffn_dim: 2048 + encoder_attention_heads: 8 + decoder_layers: 6 + decoder_ffn_dim: 2048 + decoder_attention_heads: 8 + d_model: 512 diff --git a/data/maskedmidiencoder.py b/data/maskedmidiencoder.py index 72f9e56..977f4ec 100644 --- a/data/maskedmidiencoder.py +++ b/data/maskedmidiencoder.py @@ -219,3 +219,46 @@ def decode(self, src_token_ids: torch.Tensor, tgt_token_ids: torch.Tensor) -> pd df["mask"] = mask_ids_column return df + + +class SingleMaskedMidiEncoder(MaskedMidiEncoder): + def __init__(self, base_encoder: MultiTokEncoder | MidiEncoder, masking_probability: float = 0.15): + super().__init__(base_encoder, masking_probability) + + def mask_record(self, record: dict) -> tuple[np.ndarray, np.ndarray]: + """ + Mask record and return tuple of src and tgt tokens with masks. + """ + src_tokens = self.base_encoder.tokenize_src(record) + tgt_tokens = src_tokens.copy() + num_masks = self.masking_probability * len(src_tokens) + + ids_to_mask = np.random.randint(len(src_tokens), size=int(num_masks)) + np_src = np.array(src_tokens) + np_tgt = np.array(tgt_tokens) + + np_src[ids_to_mask] = "" + + return np_src, np_tgt + + def encode_record(self, record: dict) -> tuple[list[int], list[int]]: + """ + Encode record into src and tgt for unsupervised T5 learning. + """ + src_tokens, tgt_tokens = self.mask_record(record) + + src = [self.token_to_id[token] for token in src_tokens] + tgt = [self.token_to_id[token] for token in tgt_tokens] + + return src, tgt + + def decode(self, src_token_ids: torch.Tensor, tgt_token_ids: torch.Tensor) -> pd.DataFrame: + tokens: list[str] = [self.vocab[token_id] for token_id in tgt_token_ids] + mask_ids = src_token_ids == self.token_to_id[""] + + df: pd.DataFrame = self.base_encoder.untokenize_src(tokens) + mask_ids_column = np.zeros_like(df["pitch"], dtype=bool) + mask_ids_column[mask_ids] = True + df["mask"] = mask_ids_column + + return df diff --git a/pipelines/BART/main.py b/pipelines/BART/main.py index 457ce6c..347eae0 100644 --- a/pipelines/BART/main.py +++ b/pipelines/BART/main.py @@ -7,7 +7,7 @@ from training_utils import train_model from data.dataset import MaskedMidiDataset, MyTokenizedMidiDataset from data.midiencoder import VelocityEncoder, QuantizedMidiEncoder -from data.maskedmidiencoder import MaskedMidiEncoder, SingleMaskedNoteEncoder +from data.maskedmidiencoder import SingleMaskedMidiEncoder, SingleMaskedNoteEncoder from data.multitokencoder import MultiMidiEncoder, MultiStartEncoder, MultiVelocityEncoder @@ -133,7 +133,7 @@ def create_masked_datasets( if cfg.mask == "notes": encoder = SingleMaskedNoteEncoder(base_encoder=base_encoder, masking_probability=cfg.masking_probability) else: - encoder = MaskedMidiEncoder(base_encoder=base_encoder, masking_probability=cfg.masking_probability) + encoder = SingleMaskedMidiEncoder(base_encoder=base_encoder, masking_probability=cfg.masking_probability) train_dataset = MaskedMidiDataset( dataset=train_translation_dataset, @@ -161,7 +161,7 @@ def create_datasets_finetune( quantization_cfg=cfg.dataset.quantization, time_quantization_method=cfg.time_quantization_method, ) - pretraining_tokenizer = MaskedMidiEncoder( + pretraining_tokenizer = SingleMaskedMidiEncoder( base_encoder=tokenizer, ) # use the same token ids as used during pre-training From 5d822a867066d3cf6e2b08354eb178394ea36e28 Mon Sep 17 00:00:00 2001 From: WojciechMat Date: Sun, 14 Jan 2024 13:52:40 +0100 Subject: [PATCH 18/19] fix keys on dashboard --- dashboard/velocity/main.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dashboard/velocity/main.py b/dashboard/velocity/main.py index 11f362b..4c9460f 100644 --- a/dashboard/velocity/main.py +++ b/dashboard/velocity/main.py @@ -193,6 +193,7 @@ def model_predictions_review( # predict velocities and get src, tgt and model output print("Making predictions ...") + key = np.random.random() for record_id in idxs: # Numpy to int :( record = dataset.get_complete_record(int(record_id)) @@ -233,13 +234,14 @@ def model_predictions_review( # Unchanged fig = ff.view.draw_pianoroll_with_velocities(true_piece) st.pyplot(fig) - from_fortepyan(true_piece) + from_fortepyan(true_piece, key=key) with cols[1]: # Predicted fig = ff.view.draw_pianoroll_with_velocities(pred_piece) st.pyplot(fig) - from_fortepyan(pred_piece) + from_fortepyan(pred_piece, key=key+1) + key += 2 if __name__ == "__main__": From 8fc5aa4dcbae0bdf90660257149b8a438e0a040f Mon Sep 17 00:00:00 2001 From: WojciechMat Date: Sun, 14 Jan 2024 14:02:20 +0100 Subject: [PATCH 19/19] update config name --- configs/BARTdenoise-dstart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/BARTdenoise-dstart.yaml b/configs/BARTdenoise-dstart.yaml index 6da05f0..41421d2 100644 --- a/configs/BARTdenoise-dstart.yaml +++ b/configs/BARTdenoise-dstart.yaml @@ -6,7 +6,7 @@ train: warmup: 4000 finetune: False -model_name: T5 +model_name: BART dataset_name: 'roszcz/maestro-v1-sustain' target: denoise seed: 26