diff --git a/Generative Models/Music_Accompaniment_Generator/.DS_Store b/Generative Models/Music_Accompaniment_Generator/.DS_Store
new file mode 100644
index 00000000..b021224e
Binary files /dev/null and b/Generative Models/Music_Accompaniment_Generator/.DS_Store differ
diff --git a/Generative Models/Music_Accompaniment_Generator/.gitattributes b/Generative Models/Music_Accompaniment_Generator/.gitattributes
new file mode 100644
index 00000000..dfe07704
--- /dev/null
+++ b/Generative Models/Music_Accompaniment_Generator/.gitattributes
@@ -0,0 +1,2 @@
+# Auto detect text files and perform LF normalization
+* text=auto
diff --git a/Generative Models/Music_Accompaniment_Generator/data_load.py b/Generative Models/Music_Accompaniment_Generator/data_load.py
new file mode 100644
index 00000000..c8cb9402
--- /dev/null
+++ b/Generative Models/Music_Accompaniment_Generator/data_load.py
@@ -0,0 +1,72 @@
+import random
+
+import torch
+from miditok import REMI, TokenizerConfig
+from utils import midi_to_array
+from tqdm import tqdm
+from train_parameters import max_len
+
+# Our parameters
+TOKENIZER_PARAMS = {
+    "pitch_range": (21, 109),
+    "beat_res": {(0, 4): 8, (4, 12): 4},
+    "num_velocities": 32,
+    "special_tokens": ["PAD", "BOS", "EOS"],
+    "use_chords": True,
+    "use_rests": False,
+    "use_tempos": True,
+    "use_programs": True,
+    "num_tempos": 191,
+    "tempo_range": (60, 250),
+    "program_changes": True,
+    "programs": [-1, 0, 24, 27, 30, 33, 36],
+}
+config = TokenizerConfig(**TOKENIZER_PARAMS)
+
+# Creates the tokenizer
+tokenizer = REMI(config)
+
+word2idx = tokenizer.vocab
+idx2word = {idx: word for idx, word in enumerate(word2idx)}
+vocab_len = len(word2idx)
+
+
+def data_load(data_type, split, data_len, x_folder, y_folder):
+    print("---Data Load Start!---")
+    x = []
+    y = []
+    data_range = (0, 1)
+    if data_type == "train":
+        data_range = range(0, split)
+    if data_type == "test":
+        data_range = range(split, data_len)
+    for i in tqdm(data_range, desc="Data Loading...", unit="data"):
+        x.append(
+            midi_to_array(
+                tokenizer=tokenizer,
+                midifile=f"{x_folder}{i}.mid",
+                max_len=max_len))
+        y.append(
+            midi_to_array(
+                tokenizer=tokenizer,
+                midifile=f"{y_folder}{i}.mid",
+                max_len=max_len))
+    x = torch.tensor(x)
+    y = torch.tensor(y)
+    print("---Data Load Completed!---")
+    return x, y
+
+
+def get_batch_indices(total_length, batch_size):
+    assert (batch_size <=
+            total_length), ('Batch size is larger than total data length.'
+                            'Check your data or change batch size.')
+    current_index = 0
+    indexes = [i for i in range(total_length)]
+    random.shuffle(indexes)
+    while True:
+        if current_index + batch_size >= total_length:
+            yield indexes[current_index:total_length], current_index
+            break
+        yield indexes[current_index:current_index + batch_size], current_index
+        current_index += batch_size
diff --git a/Generative Models/Music_Accompaniment_Generator/midi_generate.py b/Generative Models/Music_Accompaniment_Generator/midi_generate.py
new file mode 100644
index 00000000..9268191f
--- /dev/null
+++ b/Generative Models/Music_Accompaniment_Generator/midi_generate.py
@@ -0,0 +1,29 @@
+from utils import merge_midi_tracks, generate_midi_v2
+from data_load import tokenizer
+from transformer import Transformer
+from data_load import vocab_len
+import torch
+
+from train_parameters import (max_len, d_model, d_ff, n_layers,
+                              heads, dropout_rate, PAD_ID)
+
+if __name__ == '__main__':
+    instruments = ['Drum', 'Bass', 'Guitar', 'Piano']
+    src_midi = "./HMuseData/Melody2Drum/Melody/69.mid"
+    for instrument in instruments:
+        print(f"-----Loading {instrument} model-----")
+        model = Transformer(src_vocab_size=vocab_len, dst_vocab_size=vocab_len, pad_idx=PAD_ID, d_model=d_model,
+                            d_ff=d_ff, n_layers=n_layers, heads=heads, dropout=dropout_rate, max_seq_len=max_len)
+        model_path = f"./models/model_{instrument}/model_{instrument}2.pth"
+        model.load_state_dict(
+            torch.load(
+                model_path,
+                map_location=torch.device('mps')))
+        print(f"-----{instrument} model loaded!-----")
+        print(f"-----Generating {instrument} track-----")
+        generate_midi_v2(model=model, tokenizer=tokenizer, src_midi=src_midi, max_len=max_len, PAD_ID=PAD_ID,
+                         tgt_midi=f"./MIDIs/output_MIDI/{instrument}_track.mid")
+        print(f"-----{instrument} track generated!-----")
+    merge_midi_tracks(src_midi, "./MIDIs/output_MIDI/Drum_track.mid", "./MIDIs/output_MIDI/Bass_track.mid",
+                      "./MIDIs/output_MIDI/Guitar_track.mid", "./MIDIs/output_MIDI/Piano_track.mid",
+                      tgt_dir="./MIDIs/output_MIDI/generated_midi.mid")
diff --git a/Generative Models/Music_Accompaniment_Generator/readme/MIDI Music Accompaniment Generator.md b/Generative Models/Music_Accompaniment_Generator/readme/MIDI Music Accompaniment Generator.md
new file mode 100644
index 00000000..f04d5805
--- /dev/null
+++ b/Generative Models/Music_Accompaniment_Generator/readme/MIDI Music Accompaniment Generator.md
@@ -0,0 +1,79 @@
+# MIDI Music Accompaniment Generator
+
+​ In this project, I implemented an automatic music accompaniment model. The model takes a single-track melody MIDI file as input and outputs a corresponding accompaniment MIDI file featuring other instruments such as guitar, bass, and drums. Accomplishing this required several additional pieces of work, which I describe below.
+
+## Rule-based music score alignment algorithm
+
+​ Among the MIDI files collected from various sources for training, a significant portion are "real" MIDI files recorded from instrumental performances. The notes and events in these performances are shaped by factors such as the performer's expression, so their timing is dynamic and uncertain. As a result, the performed beats contain timing errors and are difficult to align precisely with the beat grid.
This shifts the events recorded in these MIDI files earlier or later on the time axis relative to the standard musical score, which in turn produces inaccurate notes when the performance is converted to notation. In the score, this shows up as an abundance of spurious sixteenth notes, thirty-second notes, dotted notes, and so forth. These artifacts make it difficult for the model to learn the correct characteristics of the music and significantly reduce learning efficiency. To address this issue, I developed a rule-based music score alignment algorithm, whose effect is shown below:
+
+​ Before the algorithm:
+
+![](align_1.png)
+
+​ After the algorithm:
+
+![align_2](align_2.png)
+
+## Tokenization
+
+​ Convert MIDI files into tensor arrays for input into the model. For example:
+
+​ Given a MIDI music file:
+
+![midi_music](midi_music.png)
+
+​ Tokenize it and convert it into a tensor array:
+
+![tokenization](tokenization.jpg)
+
+## Build the model
+
+​ Build a model based on the Transformer architecture. The structure of the model is shown in the diagram below.
+
+![structure](structure.jpg)
+
+## Train and Evaluate
+
+​ Train the model and then evaluate its performance. The results are shown in the tables below, together with the hyperparameter values used during training.
+
+​ Hyperparameters:
+
+| Hyperparameters | Value |
+| :-------------: | :----: |
+| lr | 0.0001 |
+| d_model | 512 |
+| d_ff | 2048 |
+| n_layers | 6 |
+| heads | 8 |
+| dropout_rate | 0.2 |
+| n_epochs | 60 |
+
+​ Train accuracy:
+
+| Model | Accuracy |
+| :---------------: | :------: |
+| DrumTransformer | 88.7% |
+| PianoTransformer | 91.6% |
+| GuitarTransformer | 85.3% |
+| BassTransformer | 89.7% |
+
+​ Test accuracy:
+
+| Model | Accuracy |
+| :---------------: | :------: |
+| DrumTransformer | 75.3% |
+| PianoTransformer | 71.5% |
+| GuitarTransformer | 64.3% |
+| BassTransformer | 67.1% |
+
+## Result
+
+​ Finally, let's demonstrate the effectiveness of the model.
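+
+​ The demo below is produced by running each per-instrument Transformer on the melody and then merging the generated tracks into one multi-track MIDI file. Here is a condensed sketch based on `midi_generate.py`; the melody path and checkpoint names are the ones hard-coded in that script and serve only as placeholders:
+
+```python
+# Sketch of the generation pipeline (condensed from midi_generate.py).
+# The melody path and checkpoint locations below are placeholders.
+import torch
+from transformer import Transformer
+from data_load import tokenizer, vocab_len
+from utils import generate_midi_v2, merge_midi_tracks
+from train_parameters import (max_len, d_model, d_ff, n_layers,
+                              heads, dropout_rate, PAD_ID)
+
+src_midi = "./HMuseData/Melody2Drum/Melody/69.mid"  # single-track melody input
+tracks = []
+for instrument in ['Drum', 'Bass', 'Guitar', 'Piano']:
+    model = Transformer(src_vocab_size=vocab_len, dst_vocab_size=vocab_len, pad_idx=PAD_ID,
+                        d_model=d_model, d_ff=d_ff, n_layers=n_layers, heads=heads,
+                        dropout=dropout_rate, max_seq_len=max_len)
+    model.load_state_dict(torch.load(f"./models/model_{instrument}/model_{instrument}2.pth",
+                                     map_location="cpu"))
+    tgt = f"./MIDIs/output_MIDI/{instrument}_track.mid"
+    generate_midi_v2(model=model, tokenizer=tokenizer, src_midi=src_midi,
+                     max_len=max_len, PAD_ID=PAD_ID, tgt_midi=tgt)
+    tracks.append(tgt)
+
+# Merge the melody with the four generated tracks into one multi-track MIDI file.
+merge_midi_tracks(src_midi, *tracks, tgt_dir="./MIDIs/output_MIDI/generated_midi.mid")
+```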
+ +​ For a given main melody MIDI music file: + +![main_melody](main_melody.png) + +​ The model can generate the auto_accompaniment MIDI file: + +![accompaniment](accompaniment.png) \ No newline at end of file diff --git a/Generative Models/Music_Accompaniment_Generator/readme/accompaniment.png b/Generative Models/Music_Accompaniment_Generator/readme/accompaniment.png new file mode 100644 index 00000000..430b0665 Binary files /dev/null and b/Generative Models/Music_Accompaniment_Generator/readme/accompaniment.png differ diff --git a/Generative Models/Music_Accompaniment_Generator/readme/align_1.png b/Generative Models/Music_Accompaniment_Generator/readme/align_1.png new file mode 100644 index 00000000..d4b6b5f5 Binary files /dev/null and b/Generative Models/Music_Accompaniment_Generator/readme/align_1.png differ diff --git a/Generative Models/Music_Accompaniment_Generator/readme/align_2.png b/Generative Models/Music_Accompaniment_Generator/readme/align_2.png new file mode 100644 index 00000000..ef7e17eb Binary files /dev/null and b/Generative Models/Music_Accompaniment_Generator/readme/align_2.png differ diff --git a/Generative Models/Music_Accompaniment_Generator/readme/main_melody.png b/Generative Models/Music_Accompaniment_Generator/readme/main_melody.png new file mode 100644 index 00000000..27508c7f Binary files /dev/null and b/Generative Models/Music_Accompaniment_Generator/readme/main_melody.png differ diff --git a/Generative Models/Music_Accompaniment_Generator/readme/midi_music.png b/Generative Models/Music_Accompaniment_Generator/readme/midi_music.png new file mode 100644 index 00000000..e365684d Binary files /dev/null and b/Generative Models/Music_Accompaniment_Generator/readme/midi_music.png differ diff --git a/Generative Models/Music_Accompaniment_Generator/readme/structure.jpg b/Generative Models/Music_Accompaniment_Generator/readme/structure.jpg new file mode 100644 index 00000000..2accd488 Binary files /dev/null and b/Generative Models/Music_Accompaniment_Generator/readme/structure.jpg differ diff --git a/Generative Models/Music_Accompaniment_Generator/readme/tokenization.jpg b/Generative Models/Music_Accompaniment_Generator/readme/tokenization.jpg new file mode 100644 index 00000000..cbfa6ff7 Binary files /dev/null and b/Generative Models/Music_Accompaniment_Generator/readme/tokenization.jpg differ diff --git a/Generative Models/Music_Accompaniment_Generator/requirements.txt b/Generative Models/Music_Accompaniment_Generator/requirements.txt new file mode 100644 index 00000000..2cde7c1c --- /dev/null +++ b/Generative Models/Music_Accompaniment_Generator/requirements.txt @@ -0,0 +1,5 @@ +mido==1.2.10 +torch==2.0.0 +miditok==3.0.1 +tqdm==4.62.3 +pretty_midi==0.2.10 \ No newline at end of file diff --git a/Generative Models/Music_Accompaniment_Generator/train.py b/Generative Models/Music_Accompaniment_Generator/train.py new file mode 100644 index 00000000..b47b744c --- /dev/null +++ b/Generative Models/Music_Accompaniment_Generator/train.py @@ -0,0 +1,24 @@ +from transformer import Transformer +from data_load import vocab_len +from train_model import train_model +from train_parameters import (max_len, batch_size, lr, d_model, d_ff, n_layers, + heads, dropout_rate, n_epochs, PAD_ID, device, + print_interval, data_split_rate, len_Dataset) + + +if __name__ == '__main__': + instruments = ['Drum', 'Bass', 'Guitar', 'Piano'] + for instrument in instruments: + model = Transformer(src_vocab_size=vocab_len, dst_vocab_size=vocab_len, pad_idx=PAD_ID, d_model=d_model, + d_ff=d_ff, 
n_layers=n_layers, heads=heads, dropout=dropout_rate, max_seq_len=max_len) + train_model( + model=model, + data_split_rate=data_split_rate, + data_len=len_Dataset[instrument], + batch_size=batch_size, + lr=lr, + n_epochs=n_epochs, + PAD_ID=PAD_ID, + device=device, + print_interval=print_interval, + instrument=instrument) diff --git a/Generative Models/Music_Accompaniment_Generator/train_model.py b/Generative Models/Music_Accompaniment_Generator/train_model.py new file mode 100644 index 00000000..0019a45a --- /dev/null +++ b/Generative Models/Music_Accompaniment_Generator/train_model.py @@ -0,0 +1,62 @@ +from data_load import get_batch_indices, data_load +from train_parameters import max_len +from tqdm import tqdm +import torch +from torch import nn +import time +import os + + +def train_model(model, data_split_rate, data_len, batch_size, lr, + n_epochs, PAD_ID, device, print_interval, instrument): + print(f"--------Train Model For {instrument} Start!--------") + x_folder = f"./HMuseData/Melody2{instrument}/Melody/" + y_folder = f"./HMuseData/Melody2{instrument}/{instrument}/" + split = round(data_len * data_split_rate) + x, y = data_load(data_type="train", split=split, data_len=data_len, + x_folder=x_folder, y_folder=y_folder) + model.to(device) + optimizer = torch.optim.Adam(model.parameters(), lr) + criterion = nn.CrossEntropyLoss(ignore_index=PAD_ID) + tic = time.time() + counter = 0 + for epoch in range(n_epochs): + for index, _ in tqdm(get_batch_indices( + len(x), batch_size), desc="Processing", unit="batches"): + x_batch = torch.LongTensor(x[index]).to(device) + y_batch = torch.LongTensor(y[index]).to(device) + y_input = y_batch[:, :-1] + y_label = y_batch[:, 1:] + y_hat = model(x_batch, y_input) + + y_label_mask = y_label != PAD_ID + preds = torch.argmax(y_hat, -1) + correct = preds == y_label + acc = torch.sum(y_label_mask * correct) / torch.sum(y_label_mask) + + n, seq_len = y_label.shape + y_hat = torch.reshape(y_hat, (n * seq_len, -1)) + y_label = torch.reshape(y_label, (n * seq_len, )) + loss = criterion(y_hat, y_label) + + optimizer.zero_grad() + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), 1) + optimizer.step() + + if counter % print_interval == 0: + toc = time.time() + interval = toc - tic + minutes = int(interval // 60) + seconds = int(interval % 60) + print(f'{counter:08d} {minutes:02d}:{seconds:02d}' + f' loss: {loss.item()} acc: {acc.item()}') + counter += 1 + + model_path = f"models/model_{instrument}/" + os.makedirs(model_path, exist_ok=True) + model_name = f"{model_path}model_{instrument}_{max_len}.pth" + torch.save(model.state_dict(), model_name) + + print(f'Model saved to {model_name}') + print(f"--------Train Model For {instrument} Completed!--------") diff --git a/Generative Models/Music_Accompaniment_Generator/train_parameters.py b/Generative Models/Music_Accompaniment_Generator/train_parameters.py new file mode 100644 index 00000000..073a78a7 --- /dev/null +++ b/Generative Models/Music_Accompaniment_Generator/train_parameters.py @@ -0,0 +1,18 @@ +batch_size = 16 +lr = 0.0001 +d_model = 512 +d_ff = 2048 +n_layers = 6 +heads = 8 +dropout_rate = 0.2 +n_epochs = 60 +PAD_ID = 0 +device = "mps" +# device = "cuda:0" +print_interval = 100 +max_len = 750 +data_split_rate = 0.99 +len_Dataset = {'Drum': 18621, + 'Bass': 14316, + 'Guitar': 20037, + 'Piano': 11684} diff --git a/Generative Models/Music_Accompaniment_Generator/transformer.py b/Generative Models/Music_Accompaniment_Generator/transformer.py new file mode 100644 index 00000000..61d9c8be 
--- /dev/null +++ b/Generative Models/Music_Accompaniment_Generator/transformer.py @@ -0,0 +1,290 @@ +from typing import Optional + +import torch +import torch.nn as nn +import torch.nn.functional as f + +MY_INF = 1e12 + + +class PositionalEncoding(nn.Module): + + def __init__(self, d_model: int, max_seq_len: int): + super().__init__() + + # Assume d_model is an even number for convenience + assert d_model % 2 == 0 + + i_seq = torch.linspace(0, max_seq_len - 1, max_seq_len) + j_seq = torch.linspace(0, d_model - 2, d_model // 2) + pos, two_i = torch.meshgrid(i_seq, j_seq) + pe_2i = torch.sin(pos / 10000**(two_i / d_model)) + pe_2i_1 = torch.cos(pos / 10000**(two_i / d_model)) + pe = torch.stack((pe_2i, pe_2i_1), 2).reshape(1, max_seq_len, d_model) + + self.register_buffer('pe', pe, False) + + def forward(self, x: torch.Tensor): + n, seq_len, d_model = x.shape + pe: torch.Tensor = self.pe + assert seq_len <= pe.shape[1] + assert d_model == pe.shape[2] + rescaled_x = x * d_model**0.5 + return rescaled_x + pe[:, 0:seq_len, :] + + +def attention(q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + mask: Optional[torch.Tensor] = None): + """ + Note: The dtype of mask must be bool + """ + # q shape: [n, heads, q_len, d_k] + # k shape: [n, heads, k_len, d_k] + # v shape: [n, heads, k_len, d_v] + assert q.shape[-1] == k.shape[-1] + d_k = k.shape[-1] + # tmp shape: [n, heads, q_len, k_len] + tmp = torch.matmul(q, k.transpose(-2, -1)) / d_k**0.5 + if mask is not None: + tmp.masked_fill_(mask, -MY_INF) + tmp = f.softmax(tmp, -1) + # tmp shape: [n, heads, q_len, d_v] + tmp = torch.matmul(tmp, v) + return tmp + + +class MultiHeadAttention(nn.Module): + + def __init__(self, heads: int, d_model: int, dropout: float = 0.1): + super().__init__() + + assert d_model % heads == 0 + # dk == dv + self.d_k = d_model // heads + self.heads = heads + self.d_model = d_model + self.q = nn.Linear(d_model, d_model) + self.k = nn.Linear(d_model, d_model) + self.v = nn.Linear(d_model, d_model) + self.out = nn.Linear(d_model, d_model) + self.dropout = nn.Dropout(dropout) + + def forward(self, + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + mask: Optional[torch.Tensor] = None): + # batch should be same + assert q.shape[0] == k.shape[0] + assert q.shape[0] == v.shape[0] + # the sequence length of k and v should be aligned + assert k.shape[1] == v.shape[1] + + n, q_len = q.shape[0:2] + n, k_len = k.shape[0:2] + q_ = self.q(q).reshape(n, q_len, self.heads, self.d_k).transpose(1, 2) + k_ = self.k(k).reshape(n, k_len, self.heads, self.d_k).transpose(1, 2) + v_ = self.v(v).reshape(n, k_len, self.heads, self.d_k).transpose(1, 2) + + attention_res = attention(q_, k_, v_, mask) + concat_res = attention_res.transpose(1, 2).reshape( + n, q_len, self.d_model) + concat_res = self.dropout(concat_res) + + output = self.out(concat_res) + return output + + +class FeedForward(nn.Module): + + def __init__(self, d_model: int, d_ff: int, dropout: float = 0.1): + super().__init__() + self.layer1 = nn.Linear(d_model, d_ff) + self.dropout = nn.Dropout(dropout) + self.layer2 = nn.Linear(d_ff, d_model) + + def forward(self, x): + x = self.layer1(x) + x = self.dropout(f.relu(x)) + x = self.layer2(x) + return x + + +class EncoderLayer(nn.Module): + + def __init__(self, + heads: int, + d_model: int, + d_ff: int, + dropout: float = 0.1): + super().__init__() + self.self_attention = MultiHeadAttention(heads, d_model, dropout) + self.ffn = FeedForward(d_model, d_ff, dropout) + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = 
nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + + def forward(self, x, src_mask: Optional[torch.Tensor] = None): + tmp = self.self_attention(x, x, x, src_mask) + tmp = self.dropout1(tmp) + x = self.norm1(x + tmp) + tmp = self.ffn(x) + tmp = self.dropout2(tmp) + x = self.norm2(x + tmp) + return x + + +class DecoderLayer(nn.Module): + + def __init__(self, + heads: int, + d_model: int, + d_ff: int, + dropout: float = 0.1): + super().__init__() + self.self_attention = MultiHeadAttention(heads, d_model, dropout) + self.attention = MultiHeadAttention(heads, d_model, dropout) + self.ffn = FeedForward(d_model, d_ff, dropout) + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.norm3 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + self.dropout3 = nn.Dropout(dropout) + + def forward(self, + x, + encoder_kv: torch.Tensor, + dst_mask: Optional[torch.Tensor] = None, + src_dst_mask: Optional[torch.Tensor] = None): + tmp = self.self_attention(x, x, x, dst_mask) + tmp = self.dropout1(tmp) + x = self.norm1(x + tmp) + tmp = self.attention(x, encoder_kv, encoder_kv, src_dst_mask) + tmp = self.dropout2(tmp) + x = self.norm2(x + tmp) + tmp = self.ffn(x) + tmp = self.dropout3(tmp) + x = self.norm3(x + tmp) + return x + + +class Encoder(nn.Module): + + def __init__(self, + vocab_size: int, + pad_idx: int, + d_model: int, + d_ff: int, + n_layers: int, + heads: int, + dropout: float = 0.1, + max_seq_len: int = 120): + super().__init__() + self.embedding = nn.Embedding(vocab_size, d_model, pad_idx) + self.pe = PositionalEncoding(d_model, max_seq_len) + self.layers = [] + for i in range(n_layers): + self.layers.append(EncoderLayer(heads, d_model, d_ff, dropout)) + self.layers = nn.ModuleList(self.layers) + self.dropout = nn.Dropout(dropout) + + def forward(self, x, src_mask: Optional[torch.Tensor] = None): + x = self.embedding(x) + x = self.pe(x) + x = self.dropout(x) + for layer in self.layers: + x = layer(x, src_mask) + return x + + +class Decoder(nn.Module): + + def __init__(self, + vocab_size: int, + pad_idx: int, + d_model: int, + d_ff: int, + n_layers: int, + heads: int, + dropout: float = 0.1, + max_seq_len: int = 120): + super().__init__() + self.embedding = nn.Embedding(vocab_size, d_model, pad_idx) + self.pe = PositionalEncoding(d_model, max_seq_len) + self.layers = [] + for i in range(n_layers): + self.layers.append(DecoderLayer(heads, d_model, d_ff, dropout)) + self.layers = nn.Sequential(*self.layers) + self.dropout = nn.Dropout(dropout) + + def forward(self, + x, + encoder_kv, + dst_mask: Optional[torch.Tensor] = None, + src_dst_mask: Optional[torch.Tensor] = None): + x = self.embedding(x) + x = self.pe(x) + x = self.dropout(x) + for layer in self.layers: + x = layer(x, encoder_kv, dst_mask, src_dst_mask) + return x + + +class Transformer(nn.Module): + + def __init__(self, + src_vocab_size: int, + dst_vocab_size: int, + pad_idx: int, + d_model: int, + d_ff: int, + n_layers: int, + heads: int, + dropout: float = 0.1, + max_seq_len: int = 200): + super().__init__() + self.encoder = Encoder(src_vocab_size, pad_idx, d_model, d_ff, + n_layers, heads, dropout, max_seq_len) + self.decoder = Decoder(dst_vocab_size, pad_idx, d_model, d_ff, + n_layers, heads, dropout, max_seq_len) + self.pad_idx = pad_idx + self.output_layer = nn.Linear(d_model, dst_vocab_size) + + @staticmethod + def generate_mask(q_pad: torch.Tensor, + k_pad: torch.Tensor, + with_left_mask: bool = False): + # 
q_pad shape: [n, q_len] + # k_pad shape: [n, k_len] + # q_pad k_pad dtype: bool + assert q_pad.device == k_pad.device + n, q_len = q_pad.shape + n, k_len = k_pad.shape + + mask_shape = (n, 1, q_len, k_len) + if with_left_mask: + mask = 1 - torch.tril(torch.ones(mask_shape)) + else: + mask = torch.zeros(mask_shape) + mask = mask.to(q_pad.device) + for i in range(n): + mask[i, :, q_pad[i], :] = 1 + mask[i, :, :, k_pad[i]] = 1 + mask = mask.to(torch.bool) + return mask + + def forward(self, x, y): + + src_pad_mask = x == self.pad_idx + dst_pad_mask = y == self.pad_idx + src_mask = self.generate_mask(src_pad_mask, src_pad_mask, False) + dst_mask = self.generate_mask(dst_pad_mask, dst_pad_mask, True) + src_dst_mask = self.generate_mask(dst_pad_mask, src_pad_mask, False) + encoder_kv = self.encoder(x, src_mask) + res = self.decoder(y, encoder_kv, dst_mask, src_dst_mask) + res = self.output_layer(res) + return res diff --git a/Generative Models/Music_Accompaniment_Generator/utils.py b/Generative Models/Music_Accompaniment_Generator/utils.py new file mode 100644 index 00000000..a64ee626 --- /dev/null +++ b/Generative Models/Music_Accompaniment_Generator/utils.py @@ -0,0 +1,170 @@ +import pretty_midi +import mido +import torch +from mido import MidiFile, MidiTrack +from miditok import TokSequence + + +def midi_to_array(tokenizer, midifile, max_len): + tokens = tokenizer(midifile) + ids = [1] + tokens_len = len(tokens.ids) + if max_len <= tokens_len: + ids.extend(tokens.ids[0:(max_len - 5)]) + ids.append(2) + pad = [0] * (max_len - len(ids[0:(max_len - 3)])) + ids.extend(pad) + else: + ids.extend(tokens.ids) + ids.append(2) + pad = [0] * (max_len - len(ids)) + ids.extend(pad) + return ids[0:max_len] + + +def ids_to_tokens(array, idx2word): + tokens = [] + for i in range(0, len(array)): + tokens.append(idx2word[int(array[i])]) + return tokens + + +def tensor_to_midi(tensor, tokenizer, tgt_midi): + ids = [int(element) for element in tensor] + generated_tokens = TokSequence() + generated_tokens.ids = ids + tokenizer.complete_sequence(generated_tokens) + generated_midi = tokenizer(generated_tokens) + generated_midi.dump_midi(tgt_midi) + + +def find_closest_element(target, collection): + closest_element = None + min_difference = float('inf') # 初始设为正无穷大 + + for element in collection: + difference = abs(target - element) + + if difference < min_difference: + min_difference = difference + closest_element = element + + return closest_element + + +def tempo_adjustment(src_midi, tgt_midi, new_bpm): + midi_file = MidiFile(src_midi) + new_midi_file = MidiFile(ticks_per_beat=midi_file.ticks_per_beat) + for i, track in enumerate(midi_file.tracks): + new_track = MidiTrack() + new_midi_file.tracks.append(new_track) + for msg in track: + if msg.type == 'set_tempo': + microseconds_per_beat = mido.bpm2tempo(new_bpm) + new_msg = msg.copy(tempo=microseconds_per_beat) + new_track.append(new_msg) + else: + new_track.append(msg) + new_midi_file.save(tgt_midi) + + +def create_timestamp_array(max_time, standard_timestamp): + timestamp_array = [] + current_time = 0 + + while current_time <= max_time: + timestamp_array.append(round(current_time, 8)) + current_time += standard_timestamp + + return timestamp_array + + +def note_alignment_32(midi_file): + midi_data = pretty_midi.PrettyMIDI(midi_file) + tempo = round(midi_data.get_tempo_changes()[1][0]) + standard_timestamp = 60 / tempo / 8 + max_time = midi_data.get_end_time() + timestamp_grid = create_timestamp_array(max_time, standard_timestamp) + for midi_track in 
midi_data.instruments: + notes = midi_track.notes + for note in notes: + note.start = find_closest_element(note.start, timestamp_grid) + note.end = find_closest_element(note.end, timestamp_grid) + if note.end == note.start: + note.end = note.start + standard_timestamp + return midi_data + + +def note_alignment_16(midi_file): + midi_data = pretty_midi.PrettyMIDI(midi_file) + tempo = round(midi_data.get_tempo_changes()[1][0]) + standard_timestamp = 60 / tempo / 4 + max_time = midi_data.get_end_time() + timestamp_grid = create_timestamp_array(max_time, standard_timestamp) + for midi_track in midi_data.instruments: + notes = midi_track.notes + for note in notes: + note.start = find_closest_element(note.start, timestamp_grid) + note.end = find_closest_element(note.end, timestamp_grid) + if note.end == note.start: + note.end = note.start + standard_timestamp + return midi_data + + +def merge_midi_tracks(*midi, tgt_dir): + bpm = round(pretty_midi.PrettyMIDI(midi[0]).get_tempo_changes()[1][0]) + midi_data = pretty_midi.PrettyMIDI() + for midi_track in midi: + midi_track_data = pretty_midi.PrettyMIDI(midi_track) + if len(midi_track_data.instruments) > 0: + midi_data.instruments.append(midi_track_data.instruments[0]) + midi_data.write(tgt_dir) + tempo_adjustment(tgt_dir, tgt_dir, bpm) + note_alignment_16(tgt_dir).write(tgt_dir) + + +def midi_strip(midifile, tokenizer, tgt_folder, max_len): + tokens = tokenizer(midifile) + strip_tokens = TokSequence() + strip_tokens.ids = tokens.ids[0:max_len] + tokenizer.complete_sequence(strip_tokens) + strip_midi = tokenizer(strip_tokens) + strip_midi.dump_midi(tgt_folder) + + +def generate_midi(model, tokenizer, src_midi, max_len, PAD_ID, tgt_midi): + model.eval() + x = midi_to_array(tokenizer=tokenizer, midifile=src_midi, max_len=max_len) + x = torch.LongTensor(x).reshape(1, max_len) + y_input = torch.ones(1, max_len, + dtype=torch.long) * PAD_ID + y_input[0] = 1 + with torch.no_grad(): + for i in range(1, y_input.shape[1]): + y_hat = model(x, y_input) + y_input[0, i] = torch.argmax(y_hat[0, i - 1]) + tensor_to_midi(y_input[0][1:], tokenizer, tgt_midi) + + +def generate_midi_v2(model, tokenizer, src_midi, max_len, PAD_ID, tgt_midi): + model.eval() + x = midi_to_array(tokenizer=tokenizer, midifile=src_midi, max_len=max_len) + x = torch.LongTensor(x).reshape(1, max_len) + y_input = torch.ones(1, max_len, + dtype=torch.long) * PAD_ID + y_input[0] = 1 + with torch.no_grad(): + for i in range(1, y_input.shape[1]): + y_hat = model(x, y_input) + y_input[0, i] = torch.argmax(y_hat[0, i - 1]) + flag = 3 + for i in range(4, len(y_input[0]) - 1): + if y_input[0][i] != 3: + flag = i + break + y_ = [ + y_input[0][i] for i in range( + 0, len( + y_input[0])) if i not in range( + 4, flag)] + tensor_to_midi(y_[1:], tokenizer, tgt_midi)
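+
+
+# Illustrative usage sketch of the alignment helpers above (not wired into the
+# training or generation scripts): the rule-based score alignment described in
+# the README amounts to snapping every note onset/offset to the nearest point
+# on a 1/16- or 1/32-note grid. The file names below are placeholders.
+if __name__ == '__main__':
+    aligned = note_alignment_16("performance.mid")  # returns a pretty_midi.PrettyMIDI
+    aligned.write("performance_aligned.mid")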