-
Notifications
You must be signed in to change notification settings - Fork 27
/
Copy pathlist_cross_ref.R
executable file
·121 lines (91 loc) · 3.12 KB
/
list_cross_ref.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
library(httr)
library(xml2)
library(data.table)
library(stringr)
#' List the commits that cross-reference a GitHub issue
#'
#' Downloads the HTML page of an issue and scrapes every element whose class
#' is exactly "TimelineItem", collecting for each one the author, the commit
#' title, the commit URL and the time of the event.
#'
#' @param issue Issue number (formatted with `%i`, so it must be a whole
#'   number).
#' @param repo Repository in "owner/name" form.
#' @param timeout Value handed to curl as the `connecttimeout` option.
#' @return A `data.table` with one row per timeline item and the columns
#'   `user`, `title`, `link`, `timestamp` (POSIXct, UTC) and `type`
#'   ("homework", "lab" or `NA`). Fields that could not be found in an item
#'   are `NA`.
list_cross_ref <- function(issue, repo = "USCbiostats/PM566", timeout = 60) {

  message("Connecting to GitHub...", appendLF = FALSE)
  issues <- GET(
    sprintf("https://github.com/%s/issues/%i", repo, issue),
    config = config(connecttimeout = timeout)
  )
  message("done.")

  page <- content(issues)

  # Checking if there's anything to parse. A page with no timeline items at
  # all is treated the same way as a single empty one.
  items <- xml_find_all(page, xpath = '//*[@class="TimelineItem"]')
  if (length(items) == 0L ||
      (length(items) == 1L && xml_length(items) == 0L))
    stop("No cross-reference to be analized.")

  # Each item is re-parsed as its own document so that the absolute ("//")
  # XPath expressions below only see the contents of that item.
  items <- lapply(as.character(items), read_html)

  # Type-stable attribute extraction: always a character vector with one
  # entry per node, NA where the node or the attribute is missing.
  attr_of <- function(nodes, attr) {
    vapply(nodes, xml_attr, character(1), attr = attr)
  }

  # Getting the times
  times <- lapply(items, xml_find_first, xpath = "//relative-time")
  times <- attr_of(times, "datetime")

  # Getting the users
  users <- lapply(
    items, xml_find_first,
    xpath = '//*[starts-with(@class, "author")]'
  )
  users <- attr_of(users, "href")
  users <- stringr::str_remove(users, "^/")

  # Getting the title
  details <- lapply(
    items, xml_find_first,
    xpath = '//*[starts-with(@class, "commit-message")]//a'
  )
  titles <- attr_of(details, "title")

  # Getting URL; items without a link stay NA instead of becoming the
  # string "https://github.comNA".
  hrefs <- attr_of(details, "href")
  links <- paste0("https://github.com", hrefs)
  links[is.na(hrefs)] <- NA_character_

  # Creating database
  dat <- data.table(
    user      = users,
    title     = titles,
    link      = links,
    timestamp = times
  )

  # Masking the original url. The URL is matched literally: as a regular
  # expression, characters such as "." in the repository name would act as
  # wildcards.
  dat[, title := str_replace(
    title,
    fixed(paste0("https://github.com/", repo, "/issues/", issue)),
    paste0("#", issue)
  )]

  # Removing newline
  dat[, title := str_replace_all(title, "\\n+", " ")]

  # Tagging the commits: "hw"/"assignme" wins over "lab"; anything else is NA
  dat[, type := fifelse(
    str_detect(tolower(title), "hw|assignme"), "homework",
    fifelse(
      str_detect(tolower(title), "lab"),
      "lab", NA_character_
    )
  )]

  # Processing the time. The scraped values are ISO 8601 stamps in UTC
  # (trailing "Z"), so the zone is set explicitly; otherwise they would be
  # read as local time.
  dat[, timestamp := as.POSIXct(
    timestamp, format = "%Y-%m-%dT%H:%M:%OS", tz = "UTC"
  )]

  # `[]` re-enables auto-printing, which data.table suppresses after `:=`
  dat[]

}
#' Build markdown links to the files listed on a commit page
#'
#' For each commit URL, downloads the page, reads the file names listed in
#' its table of contents (the element with id "toc") and returns them as a
#' comma-separated list of markdown links pointing at each file as of that
#' commit.
#'
#' @param x Character vector of commit URLs of the form
#'   "<repository url>/commit/<commit id>".
#' @return A character vector with one entry per element of `x`. An entry is
#'   `NA` when the URL is `NA`, the request fails or the response status is
#'   not 2xx, and "" when the page lists no file names.
find_rmd_involved <- function(x) {

  # Nothing to download: keep the output the same length as the input
  if (length(x) == 0L)
    return(character(0))

  # Vector input: process the URLs one at a time, reporting progress
  if (length(x) > 1) {

    ans <- character(length(x))
    message("Downloading data...", appendLF = FALSE)
    for (i in seq_along(ans)) {
      message(i, ", ", appendLF = FALSE)
      ans[i] <- find_rmd_involved(x[i])
    }
    message("done.")
    return(ans)

  }

  # A missing URL cannot be fetched; skip the request altogether
  if (is.na(x))
    return(NA_character_)

  y <- tryCatch(GET(url = x), error = function(e) e)
  if (inherits(y, "error"))
    return(NA_character_)

  # Only 2xx responses are parsed
  if (status_code(y) %/% 100L != 2L)
    return(NA_character_)

  # Getting the contents
  y <- content(y)

  # TOC. The second query is relative (".//"): an absolute "//" expression
  # would search the whole document instead of the TOC node.
  toc <- xml_find_first(y, xpath = '//*[@id = "toc"]')
  toc <- xml_find_first(
    toc, xpath = './/*[starts-with(@class, "content")]'
  )

  # Getting the filenames: the text of every link inside the TOC
  toc <- xml_text(xml_find_all(toc, xpath = ".//a"), trim = TRUE)
  toc <- toc[nchar(toc) > 0]

  # Getting commit code: everything after "commit/" identifies the commit,
  # everything before "/commit/" is the repository URL
  url_commit <- str_extract(x, "(?<=commit/).+")
  url_repo   <- gsub("/commit/.+", "", x, perl = TRUE)

  links <- sprintf(
    "%s/blob/%s/%s", url_repo, url_commit, toc
  )

  paste(sprintf("[%s](%s)", toc, links), collapse = ", ")

}