Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

pair programming #14

Draft
wants to merge 5 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added .DS_Store
Binary file not shown.
8 changes: 8 additions & 0 deletions .idea/.gitignore

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions .idea/inspectionProfiles/profiles_settings.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions .idea/midi-internship.iml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions .idea/misc.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions .idea/modules.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions .idea/vcs.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 6 additions & 6 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
default_language_version:
python: python3.9.10
python: python3.10.12
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.0.1
rev: v4.4.0
hooks:
- id: check-merge-conflict
- id: end-of-file-fixer
Expand All @@ -11,19 +11,19 @@ repos:
- id: check-yaml
- id: check-docstring-first
- id: requirements-txt-fixer
- repo: https://gitlab.com/pycqa/flake8
rev: 3.9.2
- repo: https://github.com/pycqa/flake8
rev: 6.1.0
hooks:
- id: flake8
args: ["--max-line-length=180","--extend-ignore=E203","--per-file-ignores=.github/scripts/bump_version.py:E402"]
- repo: https://github.com/ambv/black
rev: 21.7b0
rev: 23.9.1
hooks:
- id: black
args: [--line-length=130]
additional_dependencies: ['click==8.0.4']
- repo: https://github.com/pycqa/isort
rev: 5.10.1
rev: 5.12.0
hooks:
- id: isort
name: isort
Expand Down
Binary file added myplot.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
64 changes: 64 additions & 0 deletions sequence_similarity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import numpy as np
import pandas as pd
from datasets import load_dataset

# Load the "train" split of the roszcz/internship-midi-data-science dataset.
# NOTE(review): this is a top-level statement, so it runs whenever the module is imported.
dataset = load_dataset("roszcz/internship-midi-data-science", split="train")

# Only the first record is used; its "notes" field is converted to a DataFrame.
record = dataset[0]
df = pd.DataFrame(record["notes"])
# Preview the first rows of the notes table.
print(df.head())


def cos_sim_score(sequence: pd.DataFrame, window: pd.DataFrame) -> float:
"""
Calculating cosine similarity between sequence and window
Args:
sequence (pd.DataFrame): input sequence
window (pd.DataFrame): subset of rolling window
Returns:
float: cosine similarity score
"""
Comment on lines +12 to +20
Copy link
Member

@roszcz roszcz Sep 14, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
def cos_sim_score(sequence: pd.DataFrame, window: pd.DataFrame) -> float:
"""
Calculating cosine similarity between sequence and window
Args:
sequence (pd.DataFrame): input sequence
window (pd.DataFrame): subset of rolling window
Returns:
float: cosine similarity score
"""
def cos_sim_score(sequence_a: pd.DataFrame, sequence_b: pd.DataFrame) -> float:
"""
Calculating cosine similarity between two sequences
Args:
sequence_a (pd.DataFrame): first sequence
sequence_b (pd.DataFrame): second sequence
Returns:
float: cosine similarity score
"""

This function is great, but the name "window" doesn't really make sense from the point of view of this function - it measures the distance between any two sequences, and it's only in your specific use case that the second sequence is a "window" moving over the signal.


# extracting numpy array and transposing, shape: [num_features, window_size]
sequence_arr = sequence.values.T
# shape: [window_size, num_features]
window_arr = window.values

# dot product, shape: [num_features, num_features]
sequence_x_window = sequence_arr @ window_arr

# shape: [num_features, 1]
sequence_norm = np.linalg.norm(sequence_arr, axis=1, keepdims=True)
# shape: [1, num_features]
window_norm = np.linalg.norm(window_arr, axis=0, keepdims=True)

# shape: [num_features, num_features]
normalization = sequence_norm * window_norm

# calculating cosine similarity for each entry
cos_sim = sequence_x_window / (normalization + 1e-8)

num_features = cos_sim.shape[0]

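# returning the mean over ALL entries of the [num_features, num_features] cosine similarity matrix
# NOTE(review): this is not a normalized trace - the same-feature similarities lie on the main diagonal,
# but the sum below also includes the off-diagonal (cross-feature) terms; confirm which one is intended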
return np.sum(cos_sim) / (num_features * num_features)


# Select the compared columns once instead of re-selecting them on every iteration.
features = df[["pitch", "velocity"]]

# Target pattern: pitch and velocity of the first 16 notes.
x = features.iloc[0:16]

scores = {"score": [], "idx": []}

seq_len = len(x)

# Slide a window of seq_len notes over the whole record.
# The last valid start index is len(features) - seq_len, hence the "+ 1":
# without it the final window would never be scored.
for i in range(len(features) - seq_len + 1):
    seq = features.iloc[i : i + seq_len]
    score = cos_sim_score(x, seq)
    scores["score"].append(score)
    scores["idx"].append(i)
Comment on lines +47 to +59
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
x = df.iloc[0:16]
x = x[["pitch", "velocity"]]
scores = {"score": [], "idx": []}
seq_len = len(x)
for i in range(0, len(df) - seq_len):
seq = df.iloc[i : i + seq_len]
seq = seq[["pitch", "velocity"]]
score = cos_sim_score(x, seq)
scores["score"].append(score)
scores["idx"].append(i)
target_sequence = df.iloc[0:16]
target_sequence = target_sequence[["pitch", "velocity"]]
scores = {"score": [], "idx": []}
seq_len = len(target_sequence)
for i in range(0, len(df) - seq_len):
sequence_window = df.iloc[i : i + seq_len]
sequence_window = sequence_window[["pitch", "velocity"]]
score = cos_sim_score(sequence_a=target_sequence, sequence_b=sequence_window)
scores["score"].append(score)
scores["idx"].append(i)

Logic's good, these are just readability suggestions 👍


# Rank the window start indices from the highest to the lowest similarity score.
similarity = pd.DataFrame(scores).sort_values(by="score", ascending=False)
print(similarity)
Comment on lines +47 to +64
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
x = df.iloc[0:16]
x = x[["pitch", "velocity"]]
scores = {"score": [], "idx": []}
seq_len = len(x)
for i in range(0, len(df) - seq_len):
seq = df.iloc[i : i + seq_len]
seq = seq[["pitch", "velocity"]]
score = cos_sim_score(x, seq)
scores["score"].append(score)
scores["idx"].append(i)
similarity = pd.DataFrame(scores)
similarity.sort_values(by="score", ascending=False, inplace=True)
print(similarity)
if __name__ == "__main__":
# Moved this from the top of the script
dataset = load_dataset("roszcz/internship-midi-data-science", split="train")
record = dataset[0]
df = pd.DataFrame(record["notes"])
print(df.head())
x = df.iloc[0:16]
x = x[["pitch", "velocity"]]
scores = {"score": [], "idx": []}
seq_len = len(x)
for i in range(0, len(df) - seq_len):
seq = df.iloc[i : i + seq_len]
seq = seq[["pitch", "velocity"]]
score = cos_sim_score(x, seq)
scores["score"].append(score)
scores["idx"].append(i)
similarity = pd.DataFrame(scores)
similarity.sort_values(by="score", ascending=False, inplace=True)
print(similarity)

Having code outside of if __name__ == "__main__" will execute it every time you try to import this file (i.e. from sequence_similarity import cos_sim_score), and it's usually better to avoid that :)

47 changes: 47 additions & 0 deletions speed.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import pandas as pd
import matplotlib.pyplot as plt
from datasets import load_dataset


def plot_speed_time(df: pd.DataFrame) -> plt.Figure:
    """
    Plot how many notes fall into each time bin of a piece.

    Every note is assigned to a bin by rounding its "end" time. If the piece spans
    more than 120 (in the units of "start"/"end"), "end" is divided by 60 before
    rounding and the axis is labelled in minutes; otherwise it is labelled in seconds.
    The input DataFrame is not modified.

    Args:
        df (pd.DataFrame): notes with at least "start" and "end" columns
    Returns:
        plt.Figure: figure with a line plot of the number of notes per time bin
    """
    # Use the earliest start instead of df["start"][0]: the latter is a label lookup
    # that raises KeyError when the index has no label 0 (e.g. a sliced or filtered frame).
    duration = df["end"].max() - df["start"].min()

    if duration > 120:
        time_unit = "minutes"
        end_times = df["end"] / 60
    else:
        time_unit = "seconds"
        end_times = df["end"]

    # Keep the bins in a local Series so the caller's DataFrame gets no extra column.
    bins = end_times.round().astype(int)

    # Count notes per bin and order the bins chronologically; plotting them in
    # first-occurrence order would make the line jump back and forth whenever
    # the "end" times are not monotonically increasing.
    notes_per_bin = bins.value_counts().sort_index()

    fig, ax = plt.subplots()
    ax.plot(notes_per_bin.index, notes_per_bin.values)
    ax.set(xlabel=f"Time ({time_unit})", ylabel="Number of notes", title="Number of notes over time")

    return fig


if __name__ == "__main__":
    # Plot the note density of every record in the "train" split, one figure at a time.
    dataset = load_dataset("roszcz/internship-midi-data-science", split="train")
    for record in dataset:
        df = pd.DataFrame(record["notes"])
        fig = plot_speed_time(df)
        plt.show()
        # pyplot keeps a reference to every figure it creates; close each one after
        # showing it so figures do not accumulate while iterating over the dataset.
        plt.close(fig)