Skip to content

Commit

Permalink
Merge pull request #91 from fhdsl/cansavvy/github_spec
Browse files Browse the repository at this point in the history
Add ability to specify which github stats to return
  • Loading branch information
kweav authored Oct 3, 2024
2 parents ebb0bc1 + 595aa2f commit dd2e3df
Showing 1 changed file with 97 additions and 22 deletions.
119 changes: 97 additions & 22 deletions R/github.R
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,11 @@ get_user_repo_list <- function(owner, count = 100000, data_format = "dataframe",
#' @param token You can provide the Personal Access Token key directly or this function will attempt to grab a PAT that was stored using the `authorize("github")` function
#' @param repo The repository name. So for `https://github.com/fhdsl/metricminer`, it would be `fhdsl/metricminer`
#' @param count How many items would you like to receive? default is 100000
#' @param github_stats Which stats would you like to collect from the GitHub API?
#' Argument should be a character vector of the names of the stats to be collected. The valid names differ depending on whether time_course is TRUE or FALSE.
#' If time_course = FALSE, it should be a vector that can include: "repo_activity", "stars", "forks", "contributors", "community".
#' If time_course = TRUE, it should be a vector that can include: "clones" and "views".
#' By default ("all"), every stat available for the chosen mode will be collected.
#' @param data_format Default is to return a curated data frame. However, if you'd like to see the raw information returned from GitHub, set data_format to "raw".
#' @param time_course Should the time course data be collected or only the summary metrics?
#' @return Repository summary or time course metrics for a particular GitHub repository as a dataframe
Expand All @@ -155,17 +160,40 @@ get_user_repo_list <- function(owner, count = 100000, data_format = "dataframe",
#' authorize("github")
#' metrics <- get_github_metrics(repo = "fhdsl/metricminer")
#'
#' # If you only want some of the stats you can choose which ones with the github_stats argument
#' metrics <- get_github_metrics(repo = "fhdsl/metricminer", github_stats = c("repo_activity", "stars"))
#'
#' summary_metrics <- get_github_repo_summary(repo = "fhdsl/metricminer")
#' timecourse_metrics <- get_github_repo_timecourse(repo = "fhdsl/metricminer")
#' }
get_github_metrics <- function(repo, token = NULL, count = 100000, data_format = "dataframe", time_course = FALSE) {
get_github_metrics <- function(repo,
token = NULL,
count = 100000,
data_format = "dataframe",
github_stats = "all",
time_course = FALSE) {

if (is.null(token)) {
# Get auth token
token <- get_token(app_name = "github")
if (is.null(token)) warning("No token found. Only public repositories will be retrieved.")
}

# There's different variables depending on whether time_course data is being collected
if (time_course) {
github_stats_full_list <- c("clones", "views")
} else {
github_stats_full_list <- c("repo_activity", "stars", "forks", "contributors", "community")
}

# Need to make sure what the user specified is what we collect
if (github_stats[1] != "all") {
if (!all(github_stats %in% github_stats_full_list)) stop("The stats specified in the github_stats argument are not supported stats. Please see documentation.")
} else {
github_stats <- github_stats_full_list
}


# Split it up
split_it <- strsplit(repo, split = "\\/")
owner <- split_it[[1]][1]
Expand All @@ -185,6 +213,9 @@ get_github_metrics <- function(repo, token = NULL, count = 100000, data_format =
community = "GET /repos/{owner}/{repo}/community/profile"
)
}

api_calls <- api_calls[github_stats]

# Put gh_repo_wrapper inside function
gh_repo_wrapper_fn <- function(api_call) {
gh_repo_wrapper(
Expand Down Expand Up @@ -242,6 +273,11 @@ get_github_metrics <- function(repo, token = NULL, count = 100000, data_format =
#' @param repo The repository name. So for `https://github.com/fhdsl/metricminer`, it would be `fhdsl/metricminer`
#' @param count How many items would you like to receive? default is 100000
#' @param data_format Default is to return a curated data frame. However, if you'd like to see the raw information returned from GitHub, set data_format to "raw".
#' @param github_stats Which stats would you like to collect from the GitHub API?
#' Argument should be a character vector of the names of the stats to be collected. The valid names differ depending on whether time_course is TRUE or FALSE.
#' If time_course = FALSE, it should be a vector that can include: "repo_activity", "stars", "forks", "contributors", "community".
#' If time_course = TRUE (which this function always uses), it should be a vector that can include: "clones" and "views".
#' By default ("all"), every stat available for that mode will be collected.
#' @return GitHub repository timecourse metrics for views and clones
#' @export
#' @examples \dontrun{
Expand All @@ -250,12 +286,13 @@ get_github_metrics <- function(repo, token = NULL, count = 100000, data_format =
#'
#' timecourse_metrics <- get_github_repo_timecourse(repo = "fhdsl/metricminer")
#' }
get_github_repo_timecourse <- function(repo, token = NULL, count = 100000, data_format = "dataframe") {
get_github_repo_timecourse <- function(repo, token = NULL, count = 100000, data_format = "dataframe", github_stats = "all") {
result <- get_github_metrics(
repo = repo,
token = token,
count = count,
data_format = data_format,
github_stats = github_stats,
time_course = TRUE
)
return(result)
Expand All @@ -268,6 +305,11 @@ get_github_repo_timecourse <- function(repo, token = NULL, count = 100000, data_
#' @param repo The repository name. So for `https://github.com/fhdsl/metricminer`, it would be `fhdsl/metricminer`
#' @param count How many items would you like to receive? default is 100000
#' @param data_format Default is to return a curated data frame. However, if you'd like to see the raw information returned from GitHub, set data_format to "raw".
#' @param github_stats Which stats would you like to collect from the GitHub API?
#' Argument should be a character vector of the names of the stats to be collected. The valid names differ depending on whether time_course is TRUE or FALSE.
#' If time_course = FALSE (which this function always uses), it should be a vector that can include: "repo_activity", "stars", "forks", "contributors", "community".
#' If time_course = TRUE, it should be a vector that can include: "clones" and "views".
#' By default ("all"), every stat available for that mode will be collected.
#' @return GitHub repository summary metrics
#' @export
#' @examples \dontrun{
Expand All @@ -276,12 +318,13 @@ get_github_repo_timecourse <- function(repo, token = NULL, count = 100000, data_
#'
#' summary_metrics <- get_github_repo_summary(repo = "fhdsl/metricminer")
#' }
get_github_repo_summary <- function(repo, token = NULL, count = 100000, data_format = "dataframe") {
get_github_repo_summary <- function(repo, token = NULL, count = 100000, data_format = "dataframe", github_stats = "all") {
result <- get_github_metrics(
repo = repo,
token = token,
count = count,
data_format = data_format,
github_stats = github_stats,
time_course = FALSE
)

Expand All @@ -295,7 +338,12 @@ get_github_repo_summary <- function(repo, token = NULL, count = 100000, data_for
#' @param repo_names a character vector of repositories you'd like to collect metrics from.
#' @param data_format Default is to return a curated data frame. However, if you'd like to see the raw information returned from GitHub, set data_format to "raw".
#' @param time_course Should the time course data be collected or only the summary metrics?
#' @return Information regarding a Github account
#' @param github_stats Which stats would you like to collect from the GitHub API?
#' Argument should be a character vector of the names of the stats to be collected. The valid names differ depending on whether time_course is TRUE or FALSE.
#' If time_course = FALSE, it should be a vector that can include: "repo_activity", "stars", "forks", "contributors", "community".
#' If time_course = TRUE, it should be a vector that can include: "clones" and "views".
#' By default ("all"), every stat available for the chosen mode will be collected.
#' @return A list of metrics for a list of repos in a dataframe format
#' @importFrom gh gh
#' @importFrom purrr map
#' @importFrom dplyr bind_rows
Expand All @@ -307,10 +355,12 @@ get_github_repo_summary <- function(repo, token = NULL, count = 100000, data_for
#' repo_names <- c("fhdsl/metricminer", "jhudsl/OTTR_Template")
#' some_repos_metrics <- get_multiple_repos_metrics(repo_names = repo_names)
#'
#' stars_and_forks <- get_multiple_repos_metrics(repo_names = repo_names, github_stats = c("stars", "forks"))
#'
#' some_repos_metrics <- get_multiple_repos_metrics(repo_names = repo_names, time_course = TRUE)
#' }
#'
get_multiple_repos_metrics <- function(repo_names = NULL, token = NULL, data_format = "dataframe", time_course = FALSE) {
get_multiple_repos_metrics <- function(repo_names = NULL, token = NULL, data_format = "dataframe", time_course = FALSE, github_stats = "all") {
if (is.null(token)) {
# Get auth token
token <- get_token(app_name = "github", try = TRUE)
Expand All @@ -326,7 +376,8 @@ get_multiple_repos_metrics <- function(repo_names = NULL, token = NULL, data_for
token = token,
repo = repo,
data_format = data_format,
time_course = time_course
time_course = time_course,
github_stats = github_stats
)
})

Expand Down Expand Up @@ -388,8 +439,15 @@ gh_repo_wrapper <- function(api_call, owner, repo, token = NULL, count = 100000)
#' @export
#'
clean_repo_metrics <- function(repo_name, repo_metric_list) {
### Summarize the rest
if (repo_metric_list$contributors[1] != "No results") {

stats_collected <- names(repo_metric_list)
cleaned_metrics <- list()

if (is.list(repo_metric_list$repo_activity)) {
cleaned_metrics$num_repo_activities <- length(repo_metric_list$repo_activity)
}
# Only summarize contributors when the `contributors` field exists and is a list (i.e. it was collected and is not "No results").
if (is.list(repo_metric_list$contributors)) {
contributors <-
lapply(repo_metric_list$contributors, function(contributor) {
data.frame(
Expand All @@ -400,29 +458,46 @@ clean_repo_metrics <- function(repo_name, repo_metric_list) {
dplyr::bind_rows() %>%
dplyr::distinct()

num_contributors <- length(unique(contributors$contributor))
total_contributors <- sum(contributors$num_contributors)
cleaned_metrics$num_contributors <- length(unique(contributors$contributor))
cleaned_metrics$total_contributors <- sum(contributors$num_contributors)
} else {
num_contributors <- NA
total_contributors <- NA
cleaned_metrics$num_contributors <- NA
cleaned_metrics$total_contributors <- NA
}

if (repo_metric_list$forks[1] != "No results") {
if (is.list(repo_metric_list$forks)) {
forks <- unlist(purrr::map(repo_metric_list$forks, "full_name"))
num_forks <- length(forks)
cleaned_metrics$num_forks <- length(forks)
} else {
cleaned_metrics$num_forks <- NA
}

if (is.list(repo_metric_list$stars)) {
# The GitHub API returns a list of the users who have given a star; we just want the count.
cleaned_metrics$num_stars <- length(unlist(purrr::map(repo_metric_list$stars, "login")))
} else {
num_forks <- NA
cleaned_metrics$num_stars <- NA
}

if (is.list(repo_metric_list$community)) {
cleaned_metrics$health_percentage <- as.numeric(repo_metric_list$community$health_percentage)
} else {
cleaned_metrics$health_percentage <- NA
}

clean_stats_names <- list(
repo_activity = "num_repo_activities",
contributors = c("num_contributors","total_contributors"),
forks = "num_forks",
stars = "num_stars",
community = "health_percentage")

# We're going to only put the stats we collected in the data.frame
clean_stats_names <- unlist(clean_stats_names[stats_collected], use.names = FALSE)

metrics <- data.frame(
repo_name,
num_forks = num_forks,
num_contributors = num_contributors,
total_contributions = total_contributors,
num_stars = length(unlist(purrr::map(repo_metric_list$stars, "login"))),
health_percentage = ifelse(repo_metric_list$community[1] != "No results", as.numeric(repo_metric_list$community$health_percentage), NA)
cleaned_metrics[clean_stats_names]
)

rownames(metrics) <- repo_name

return(metrics)
Expand Down

0 comments on commit dd2e3df

Please sign in to comment.