The GithubMetrics package

Easy access to GithubMetrics via a gh wrapper

At work I manage a data science team, and the backbone to our work is an on-premise Github server. This holds our research code, as well as pan-study code (e.g. packages and libraries). To help keep on top of our codebase, we use the Github API. To make it easier to manage this codebase, I threw some of these functions into an R package called GithubMetrics.

The aim of this package is to provide a wrapper on gh to quickly get you the key Github repo information you need. The code here is used within Roche to let me quickly answer simple questions like:

  • How many studies have more than 1 data scientist (and roughly what’s the commit split)
  • What are the common languages being used (proxied through file type distribution within repos)
  • Pull commit metadata to enrich other study info held in other systems

Table of Contents

Setup

# devtools::install_github("OpenPharma/GithubMetrics")
library(GithubMetrics)
# glue() builds the plot subtitle and the HTML snippets in the tables below
library(glue)
# dplyr / tidyr / purrr / ggplot2 verbs used throughout the script
library(tidyverse)

# GitHub organisation whose repositories are analysed below
organisation <- "openpharma"

Info on the repos

Quickly pull info on all the repos in a particular org. Here I look at the organisation called OpenPharma.

# Fetch the organisation's repositories and pass them through
# gh_repos_clean() before any further use.
repos <- gh_repos_clean(gh_repos_get(organisation))

# One row per repository, most recently updated first.
repos %>%
  mutate(days_since_updated = Sys.Date() - as.Date(updated_at)) %>%
  arrange(days_since_updated) %>%
  select(name, language, MB, days_since_updated) %>%
  knitr::kable()
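| name | language | MB | days_since_updated |
|:---|:---|---:|:---|
| GithubMetrics | R | 0.1 | 0 days |
| BBS-causality-training | R | 0.0 | 1 days |
| visR | HTML | 20.8 | 1 days |
| facetsr | R | 2.1 | 61 days |
| CTP | R | 0.9 | 85 days |
| simaerep | R | 77.6 | 86 days |
| ReadStat | C | 1.8 | 126 days |
| visR-docs | Unsure | 5.3 | 131 days |
| sas7bdat | Python | 0.1 | 141 days |
| syntrial | R | 0.3 | 199 days |
| icd_hierarchies | Unsure | 0.0 | 267 days |
| pypharma_nlp | Jupyter Notebook | 28.0 | 289 days |
| RDO | R | 0.5 | 327 days |
| openpharma.github.io | JavaScript | 0.9 | 1315 days |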

Get all commits

Now I can pull all the commits on the main branch across repos in that org.

# Pull commits for every repository whose reported size is non-zero
# (used here as the check that the repository has some commits),
# looking back 10 * 365 days.
repo_all_commits <- gh_commits_get(
  repos %>% filter(size > 0) %>% pull(full_name),
  days_back = 10 * 365
)

## Pulling commits looking back to 2011-02-02

# Commits, distinct authors and latest commit date per repository.
# Rows whose author is one of the two listed values are dropped first;
# the "openpharma/" prefix is stripped so the table shows bare repo names.
repo_all_commits %>%
  filter(!author %in% c(".gitconfig missing email", "actions-user")) %>%
  group_by(repo = gsub("openpharma/", "", full_name)) %>%
  summarise(
    commits = n(),
    contributors = n_distinct(author),
    last_commit = max(as.Date(datetime))
  ) %>%
  arrange(desc(commits)) %>%
  knitr::kable()
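| repo | commits | contributors | last_commit |
|:---|---:|---:|:---|
| ReadStat | 992 | 13 | 2020-09-04 |
| visR | 380 | 14 | 2020-11-19 |
| pypharma_nlp | 110 | 1 | 2020-04-16 |
| sas7bdat | 86 | 7 | 2020-09-10 |
| RDO | 42 | 1 | 2020-03-09 |
| GithubMetrics | 19 | 1 | 2021-01-30 |
| CTP | 6 | 3 | 2020-10-19 |
| simaerep | 6 | 1 | 2020-11-05 |
| BBS-causality-training | 3 | 1 | 2021-01-29 |
| visR-docs | 3 | 1 | 2020-09-21 |
| openpharma.github.io | 2 | 1 | 2017-06-25 |
| syntrial | 2 | 1 | 2020-07-15 |
| facetsr | 1 | 1 | 2020-11-30 |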

Get visR commits

Now digging into a single repo, for the R package visR.

# Commits for the single repository OpenPharma/visR, with a plain Date
# column derived from the commit datetime for plotting.
visr_all_commits <- gh_commits_get("OpenPharma/visR", days_back = 10 * 365) %>%
  mutate(date = as.Date(datetime))

## Pulling commits looking back to 2011-02-02

# Cumulative commit count for visR as a step chart.
# Commits are binned per day (binwidth = 1) and the per-bin counts are
# accumulated with cumsum(); after_stat() replaces the deprecated
# `..count..` dot-dot notation for referring to the computed bin counts.
# NOTE(review): the subtitle counts every row of visr_all_commits, while the
# plotted data drops the ".gitconfig missing email" author - confirm whether
# the two numbers are meant to agree.
visr_all_commits %>%
  filter(!author %in% c(".gitconfig missing email")) %>%
  ggplot(aes(x = date)) +
  stat_bin(
    aes(y = after_stat(cumsum(count))),
    geom = "step",
    binwidth = 1
  ) +
  ggthemes::theme_hc() +
  labs(
    x = "Date",
    y = "Commits",
    title = "Cumulative commit count for OpenPharma/visR",
    subtitle =
      glue("{nrow(visr_all_commits)} commits were made to master since project started (First commit: {min(visr_all_commits$date)})"),
    caption = paste0("Data collected on ", Sys.Date())
  )

Who has been contributing to visR?

# Number of commits per author, leaving out the two listed author values.
contributors <- visr_all_commits %>%
  filter(!author %in% c(".gitconfig missing email", "actions-user")) %>%
  count(author, name = "commits")

# Add whatever gh_user_get() returns for each author, matched on username.
contributors <- left_join(
  contributors,
  gh_user_get(contributors$author),
  by = c("author" = "username")
)

# Render the contributor table, busiest contributor first.
contributors %>%
  arrange(desc(commits)) %>%
  mutate(
    last_on_github = Sys.Date() - last_active,
    # avatar thumbnail followed by the username
    contributor = glue('<img src="{avatar}" alt="" width="30"> {author}'),
    # show a hyperlink only when a blog URL is present
    blog = case_when(
      blog == "" ~ "",
      TRUE ~ as.character(glue('<a href="{blog}">link</a>'))
    )
  ) %>%
  select(
    contributor, commits, name, last_on_github, company, location, blog
  ) %>%
  knitr::kable(caption = "People that have contributed to visR master")
contributorcommitsnamelast_on_githubcompanylocationblog
SHAESEN2127Steven Haesendonckx16 days
bailliem109Mark Baillie0 daysBasel, CHlink
epijim69James Black1 daysRocheBasel, Switzerlandlink
Jonnie-Bevan2559 days
cschaerfe21Charlotta114 days
diego-s12Diego S251 days
rebecca-albrecht41 days
dazim3Tim Treis19 daysHeidelberg, Germany
kentm43Matt Kent4 daysGenesis Research
kawap2285 daysRoche / 7N
thomas-neitmann2Thomas Neitmann14 daysRocheBasel, Switzerlandlink
galachad1Adam Foryś16 days@RocheWarsaw, Polandlink
ginberg110 daysRemotelink
thanos-siadimas168 days

Explore the files present

Now use the API to explore files present in head across repos in this org. Just for fun I’ll compare R to Python files present.

# List the files in each repository, restricted to the last commit only.
repo_files <- gh_repo_files_get(
  only_last_commit = TRUE,
  repo_commits = repo_all_commits
)

## Pulling files in latest commit from 13 repos

# Per repository: total files, R files, Python files (including Jupyter
# notebooks), and whichever of R / Python has more files ("?" on a tie).
repo_files %>%
  group_by(repo) %>%
  summarise(
    Files = n(),
    `R files` = sum(lang %in% "R"),
    `Python files` = sum(lang %in% c("Python", "Jupyter Notebook")),
    Language = case_when(
      `Python files` < `R files` ~ "R",
      `Python files` > `R files` ~ "Python",
      TRUE ~ "?"
    )
  ) %>%
  knitr::kable(caption = "Types of files in the organisation")
repoFilesR filesPython filesLanguage
openpharma/BBS-causality-training420R
openpharma/CTP100300R
openpharma/facetsr63130R
openpharma/GithubMetrics44220R
openpharma/openpharma.github.io7610R
openpharma/pypharma_nlp131049Python
openpharma/RDO105110R
openpharma/ReadStat20700?
openpharma/sas7bdat802Python
openpharma/simaerep145320R
openpharma/syntrial67240R
openpharma/visR177810R
openpharma/visR-docs18500?

Search for code

Finally, a toy example of searching for code. Note that it is a plain-text search, so there will be false positives, particularly if the package name is a common word (I think that’s less of an issue here).

## Count the repositories in an organisation whose code mentions a term.
##
## x    Search term (here a package name), passed to gh_repo_search() as `code`.
## org  Organisation to search in; defaults to "openpharma".
##
## Returns a tibble with columns Organisation, Package and Repos (the number
## of distinct `full_name` values among the hits), or NULL when the search
## gave nothing to summarise. NULL results are dropped by map_df() and
## bind_rows(), so callers can combine results directly.
helper_gh_repo_search <- function(x, org = "openpharma"){

  ## Slow it down! The search API has a rate limit of 30 calls a minute.
  ## On an on-premise server the search rate limit is higher, so this pause
  ## is usually not needed there.
  if (interactive()) {
    message("Wait 5 seconds")
  }
  Sys.sleep(5)
  ## End slow down

  results <- gh_repo_search(
    code = x,
    organisation = org
  )

  ## The original guard was `if (is.na(results))`. On a data frame is.na()
  ## returns a logical matrix, and a condition of length > 1 is an error in
  ## R >= 4.2. Only treat a missing or single-NA result as "no hits".
  ## NOTE(review): presumably gh_repo_search() returns a bare NA when nothing
  ## is found - confirm against the package.
  no_hits <- is.null(results) ||
    (is.atomic(results) && length(results) == 1L && is.na(results))
  if (no_hits) {
    return(NULL)
  }

  results %>%
    mutate(Package = x, Organisation = org) %>%
    group_by(Organisation, Package) %>%
    summarise(
      Repos = n_distinct(full_name), .groups = "drop"
    )
}

# Package names to search for.
packages <- c("tidyverse", "pkgdown", "dplyr", "data.table")

# Run the search for every package within every organisation, in the listed
# order, and stack the per-search summary rows into one data frame.
package_use <- c(
  "openpharma", "AstraZeneca", "Roche", "Genentech", "Novartis"
) %>%
  map_df(function(org) {
    map_df(packages, helper_gh_repo_search, org = org)
  })

## tidyverse does not appear in AstraZeneca.
## pkgdown does not appear in AstraZeneca.
## data.table does not appear in AstraZeneca.
## query = 'data.table in:file  user:AstraZeneca'

# One row per organisation, one column per package, plus a row total that
# ignores packages with no hits (NA); busiest organisation first.
package_use %>%
  pivot_wider(names_from = Package, values_from = Repos) %>%
  mutate(Total = rowSums(select(., -Organisation), na.rm = TRUE)) %>%
  arrange(desc(Total)) %>%
  knitr::kable(
    caption = "Package use detected within repositaries in Pharma orgs"
  )
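| Organisation | tidyverse | pkgdown | dplyr | data.table | Total |
|:---|---:|---:|---:|---:|---:|
| Novartis | 4 | 6 | 10 | 12 | 32 |
| openpharma | 4 | 6 | 6 | 2 | 18 |
| Roche | 3 | 3 | 2 | 3 | 11 |
| Genentech | 3 | 2 | 3 | 3 | 11 |
| AstraZeneca | | | 1 | | 1 |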
James Black
James Black
PhD (Cantab)

James Black. Kiwi | Epidemiologist | Data Scientist | Engineering enthusiast.

comments powered by Disqus

Related