Add comments in analyze/data_analysis.py
Signed-off-by: Priyanshi Gaur <[email protected]>
nox1134 committed Mar 11, 2024
1 parent 876c6ae commit 602bd27
Showing 1 changed file with 64 additions and 16 deletions.
analyze/data_analysis.py
@@ -17,22 +17,27 @@
import seaborn as sns

warnings.filterwarnings("ignore")

# Third-party
from wordcloud import STOPWORDS, WordCloud # noqa: E402

# Set the current working directory
CWD = os.path.dirname(os.path.abspath(__file__))


def tags_frequency(csv_path, column_names):
-# attribute csv_path is string
-# attribute column_names is a list
-# i.e. column_names = ["tags", "description"]
"""
-This function is to generate a word cloud
-based on all the tags of each license
-each license one cloud
+Generate a word cloud based on all the tags of each license.
+Each license has its own cloud.
+Args:
+- csv_path (str): Path to the CSV file containing data.
+- column_names (list): List of column names to process.
+    Example: ["tags", "description"]
"""
df = pd.read_csv(csv_path)
+# Process each column containing tags
for column_name in column_names:
list2 = []
if column_name == "tags":
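For context, the signature documented above would be exercised like the following hypothetical invocation; whether a given CSV actually contains both columns is an assumption, not part of this commit:

    # Hypothetical usage; the path is one of the dataset files referenced later in this file
    tags_frequency("../flickr/dataset/cleaned_license1.csv", ["tags", "description"])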
@@ -56,7 +61,7 @@ def tags_frequency(csv_path, column_names):
text = ""
stopwords = set(STOPWORDS)

-# The stop words can be customized based on diff cases
+# Customize stop words for the word cloud
flickr_customized = {
"nan",
"https",
@@ -103,7 +108,7 @@ def tags_frequency(csv_path, column_names):
# Join the lowercase words with a space separator
text = " ".join(lowercase_words)

-# Creating the word cloud
+# Creating WordCloud
tags_word_cloud = WordCloud(
width=800,
height=800,
@@ -112,7 +117,7 @@ def tags_frequency(csv_path, column_names):
min_font_size=10,
).generate(text)

-# Plotting the word cloud
+# Plotting the WordCloud
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(tags_word_cloud, interpolation="bilinear")
plt.axis("off")
@@ -129,12 +134,21 @@ def tags_frequency(csv_path, column_names):


def time_trend_helper(df):
"""
Extract year-wise count of entries from a DataFrame.
Args:
df (DataFrame): Input DataFrame containing dates.
Returns:
DataFrame: DataFrame with counts of entries per year.
"""
year_list = []
for date_row in df["dates"][0:]:
date_list = str(date_row).split()
year_list.append(date_list[0])
df["Dates"] = year_list

+# Count occurrences of each year
# Use rename_axis to name the index column, then reset_index
count_df = (
df["Dates"]
@@ -143,11 +157,18 @@ def time_trend_helper(df):
.rename_axis("Dates")
.reset_index(name="Counts")
)
+# Remove first and last rows
count_df = count_df.drop([0, len(count_df) - 1])
return count_df


def time_trend(csv_path):
"""
Generate a line graph to show the time trend of the license usage.
Args:
csv_path (str): Path to the CSV file.
"""
df = pd.read_csv(csv_path)
count_df = time_trend_helper(df)

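As a side note on the counting chain in time_trend_helper above, here is a minimal standalone sketch of the value_counts / rename_axis / reset_index pattern, using invented toy values:

    import pandas as pd

    years = pd.Series(["2019", "2020", "2020", "2021"])
    counts = (
        years.value_counts()         # count occurrences of each year
        .rename_axis("Dates")        # name the index produced by value_counts
        .reset_index(name="Counts")  # convert back to a two-column DataFrame
    )
    print(counts)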
@@ -181,9 +202,13 @@ def time_trend_compile_helper(yearly_count):

def time_trend_compile_helper(yearly_count):
"""
-yearly_count is the dataframe with "year" and "Counts" as two columns
-This function will return counts - the list of "Counts" with the
-condition that their corresponding "year" is between [2000, 2022]
+Filter yearly trend data for the years between 2018 and 2022.
+Args:
+    yearly_count (DataFrame): DataFrame with "year" and "Counts" columns.
+Returns:
+    DataFrame: Filtered yearly count data.
"""
Years = np.arange(2018, 2023)
yearly_count["year"] = list(yearly_count.index)
@@ -201,6 +226,9 @@ def time_trend_compile_helper(yearly_count):


def time_trend_compile():
"""
Compile yearly trends for different licenses and plot them.
"""
license1 = pd.read_csv("../flickr/dataset/cleaned_license1.csv")
license2 = pd.read_csv("../flickr/dataset/cleaned_license2.csv")
license3 = pd.read_csv("../flickr/dataset/cleaned_license3.csv")
@@ -209,6 +237,7 @@ def time_trend_compile():
license6 = pd.read_csv("../flickr/dataset/cleaned_license6.csv")
license9 = pd.read_csv("../flickr/dataset/cleaned_license9.csv")
license10 = pd.read_csv("../flickr/dataset/cleaned_license10.csv")
+# Calculate yearly counts for each license
count_df1 = time_trend_helper(license1)
count_df2 = time_trend_helper(license2)
count_df3 = time_trend_helper(license3)
@@ -240,7 +269,6 @@ def time_trend_compile():
each_raw_data.dropna(how="all")
list_data.append(each_raw_data)

-# We set years are from 2000 to 2022
yearly_count1 = list_data[0].to_frame()
yearly_count2 = list_data[1].to_frame()
yearly_count3 = list_data[2].to_frame()
@@ -249,6 +277,7 @@ def time_trend_compile():
yearly_count6 = list_data[5].to_frame()
yearly_count9 = list_data[6].to_frame()
yearly_count10 = list_data[7].to_frame()
+# Filter yearly count data for the years between 2018 and 2022
yearly_count1 = time_trend_compile_helper(yearly_count1)
yearly_count2 = time_trend_compile_helper(yearly_count2)
yearly_count3 = time_trend_compile_helper(yearly_count3)
@@ -259,7 +288,7 @@ def time_trend_compile():
yearly_count10 = time_trend_compile_helper(yearly_count10)
print(yearly_count1)

-# plot lines
+# Plot yearly trend for all licenses
plt.plot(
yearly_count1["Years"],
yearly_count1["Yearly_counts"],
@@ -337,6 +366,15 @@ def time_trend_compile():


def view_compare_helper(df):
"""
Calculate maximum views of pictures under a license.
Args:
df (DataFrame): Input DataFrame.
Returns:
int: Maximum views.
"""
highest_view = int(max(df["views"]))
df = df.sort_values("views", ascending=False)
return highest_view
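Note that the sorted copy produced by sort_values above is discarded before the return; only the maximum is used. A minimal sketch of calling this helper on invented toy data:

    import pandas as pd

    toy = pd.DataFrame({"views": [12, 907, 5]})
    print(view_compare_helper(toy))  # prints 907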
@@ -345,6 +383,9 @@ def view_compare_helper(df):


def view_compare():
"""
Compare maximum views of pictures under different licenses.
"""
license1 = pd.read_csv(
os.path.join(CWD, "../flickr/dataset/cleaned_license1.csv")
)
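Unlike time_trend_compile above, this function anchors its relative paths with os.path.join(CWD, ...), so the reads work regardless of the caller's working directory. A minimal sketch of that pattern:

    import os

    CWD = os.path.dirname(os.path.abspath(__file__))
    csv_path = os.path.join(CWD, "../flickr/dataset/cleaned_license1.csv")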
@@ -379,10 +420,12 @@ def view_compare():
license9,
license10,
]
+# Calculate maximum views for each license
maxs = []
for lic in licenses:
maxs.append(view_compare_helper(lic))
print(maxs)
+# Create a DataFrame to store licenses and their maximum views
temp_data = pd.DataFrame()
temp_data["Licenses"] = [
"CC BY-NC-SA 2.0",
@@ -395,6 +438,7 @@ def view_compare():
"Public Domain Mark 1.0",
]
temp_data["views"] = maxs
+# Plot bar graph
fig, ax = plt.subplots(figsize=(13, 10))
ax.grid(b=True, color="grey", linestyle="-.", linewidth=0.5, alpha=0.6)
sns.set_style("dark")
@@ -433,7 +477,10 @@ def view_compare():


def total_usage():
-# this will use the license total file as input dataset
+"""
+Generate a bar plot showing the total usage of different licenses.
+"""
+# Reads the license total file as the input dataset
df = pd.read_csv(os.path.join(CWD, "../flickr/dataset/license_total.csv"))
df["License"] = [str(x) for x in list(df["License"])]
fig = px.bar(df, x="License", y="Total amount", color="License")
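A minimal standalone sketch of the same plotly.express call on invented toy data (the license names are taken from the list in view_compare; the totals are made up):

    import pandas as pd
    import plotly.express as px

    toy = pd.DataFrame(
        {"License": ["CC BY-NC-SA 2.0", "Public Domain Mark 1.0"], "Total amount": [120, 80]}
    )
    fig = px.bar(toy, x="License", y="Total amount", color="License")
    fig.show()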
@@ -448,6 +495,7 @@ def main():


if __name__ == "__main__":
+# Exception handling
try:
main()
except SystemExit as e:
