This repository has been archived by the owner on Jun 6, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 61
/
Copy path: check-markdown-links.rb
executable file
·152 lines (135 loc) · 4.17 KB
/
check-markdown-links.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
#!/usr/bin/env ruby
require 'open-uri'
require 'pathname'
require 'pp'
# Remote-URL checking is ON by default. It is only disabled when the
# DO_NOT_CHECK_HTTP_LINKS env var is set to something other than
# "", "0", "no" or "false" (the regex matches the "keep checking" values,
# and `=~` returns a truthy index for them, so the leading `!` flips it).
DO_NOT_CHECK_HTTP_LINKS = !(ENV["DO_NOT_CHECK_HTTP_LINKS"].to_s =~ /^(|0|no|false)$/)
if !DO_NOT_CHECK_HTTP_LINKS
  puts "Checking remote URLs for 404 errors (set DO_NOT_CHECK_HTTP_LINKS to disable)"
end
# 1. Get all links
# 2. For each link, check if it's good:
# a. If it's local anchor, check if it's in the list of anchors.
# b. If it's local file, fetch that file and check if it's in that file's anchors.
# c. If it's global URL, fetch URL to see if it's not 404 and well-formed.
# Global failure flag: flipped by check_links, read at the bottom of the
# script to decide the process exit status.
$check_links_failed = false
# Walks every Markdown file under the working directory (skipping anything
# inside a vendor/ directory), first collecting links and anchors from all
# files, then validating every collected link against the full dataset.
def main
  dataset = {}
  files = Dir["**/*.md"].reject { |path| path =~ %r{/vendor/} }
  files.each do |path|
    collect_links_and_anchors(path, dataset)
  end
  files.each do |path|
    puts "Checking links in #{path}"
    check_links(path, dataset[path][:links], dataset)
  end
end
# Resolves +relpath+ (as it appears inside +srcfilepath+) against that
# file's directory, then re-expresses the result relative to the current
# working directory, so every file in the dataset is keyed consistently.
def canonicalize_path_in_file(relpath, srcfilepath)
  base_dir = File.dirname(srcfilepath)
  absolute = Pathname.new(File.expand_path(relpath, base_dir))
  absolute.relative_path_from(Pathname.new(Dir.pwd)).to_s
end
# Reads the Markdown file +fp+ once and memoizes two things under
# dataset[fp]: the inline links (as [title, ref, 1-based line number]
# triples) and the anchors derived from its headings.
def collect_links_and_anchors(fp, dataset={})
  content = File.read(fp) || ""
  entry = (dataset[fp] ||= {links:nil, anchors:nil})

  entry[:links] ||= content.split("\n").each_with_index.flat_map do |line, idx|
    line.scan(%r{\[([^\]]*)\]\(([^\)]*)\)}m).map do |title, ref|
      [title, ref, idx + 1]
    end
  end

  entry[:anchors] ||= extract_anchors(content)
end
# Validates every link previously collected from +file+:
#   * "#anchor"     -> must exist among the file's own heading anchors
#   * "http(s)://…" -> must be fetchable (results cached in
#                      dataset["__checked_remote_urls"] across files)
#   * anything else -> cross-file link, optionally "#anchor"-suffixed,
#                      resolved relative to +file+
# Failures are reported on $stderr and flip the global $check_links_failed
# flag; the method itself returns nothing meaningful.
def check_links(file, links, dataset = {})
  dataset["__checked_remote_urls"] ||= {}
  cache = dataset["__checked_remote_urls"]
  links.each do |(name, ref, lineno)|
    if ref[0,1] == "#"
      # Local anchor: must match one of this file's own headings.
      if !dataset[file][:anchors].include?(ref)
        $stderr.puts "#{file}:#{lineno}: invalid anchor: [#{name}](#{ref})"
        $check_links_failed = true
      end
    elsif ref =~ %r{^https?://}
      # Remote URL: probe it once; subsequent files reuse the cached verdict.
      if !check_url(ref, cache)
        $stderr.puts "#{file}:#{lineno}: external file does not load: [#{name}](#{ref})"
        $check_links_failed = true
      end
    else # cross-file link
      ref = ref.sub(%r{^\./},"")
      fn, anchor = ref.split("#")
      anchor = "##{anchor}" if anchor
      linked_fn = canonicalize_path_in_file(fn, file)
      if f = dataset[linked_fn]
        if !anchor
          # Bare link to a known markdown file: nothing more to verify.
        elsif !f[:anchors].include?(anchor)
          $stderr.puts "#{file}:#{lineno}: invalid anchor: [#{name}](#{ref}) (check headings in #{linked_fn})"
          $check_links_failed = true
        end
      else
        if !anchor && check_url(linked_fn, cache)
          # The reference is fine: the non-markdown file exists somewhere and we link to it as a whole.
        else
          # (fixed diagnostic typo: "does not exists" -> "does not exist")
          $stderr.puts "#{file}:#{lineno}: referenced file does not exist: [#{name}](#{ref}) (expanded to #{linked_fn})"
          $check_links_failed = true
        end
      end
    end
  end
end
# Returns true when +url+ is reachable: a remote http(s) URL that loads,
# a local file that can be read, or an existing directory. Verdicts are
# memoized in +cache+ as "ok"/"failed" so each target is probed only once.
def check_url(url, cache = {})
  # Honour negative cache entries too. Previously `return true if cache[url]`
  # treated a cached "failed" (truthy string) as success on repeat lookups.
  return cache[url] == "ok" if cache.key?(url)
  if url == "https://dx.doi.org/10.6028/NIST.FIPS.202"
    # Known-good DOI link that is special-cased; skip the fetch.
    true
  elsif DO_NOT_CHECK_HTTP_LINKS && url =~ /^https?:/
    # Remote checking disabled via env var: optimistically accept.
    true
  elsif Dir.exist?(url) # Dir.exists? was removed in Ruby 3.2
    cache[url] = "ok"
    true
  else
    # Probe the target: remote URLs via open-uri (Kernel#open lost URL
    # support in Ruby 3.0), everything else as a plain file read.
    body = begin
      url =~ %r{^https?://} ? URI.open(url).read : File.read(url)
    rescue StandardError
      nil
    end
    exists = !body.nil?
    cache[url] = exists ? "ok" : "failed"
    exists
  end
end
# Returns the "#anchor" slug for every heading line found in +data+,
# in document order.
def extract_anchors(data)
  headings = data.split("\n").map { |line| extract_heading(line) }
  headings.compact.map { |(_depth, _title, anchor)| anchor }
end
# Parses one Markdown heading line.
# Returns nil for non-heading lines; otherwise [depth, title, anchor],
# where depth is the number of leading '#', title is the raw heading
# text, and anchor is the GitHub-style "#slug" derived from the title.
def extract_heading(line)
  match = /^(#+)\s(.*)/.match(line)
  return nil unless match

  hashes = match[1]
  title = match[2]
  slug = title.downcase
  slug = slug.gsub(/[\/:]/,"")             # Github turns "/url/:id" into "#urlid"
  slug = slug.gsub(/\W+/,"-")              # runs of non-word chars become one dash
  slug = slug.gsub(/(\d)\-(\d)/,"\\1\\2")  # no dash kept between adjacent digits
  slug = slug.gsub(/^\-+/,"").gsub(/\-+$/,"") # trim leading/trailing dashes
  [hashes.size, title, "#" + slug]
end
# Kick off the scan; fail the process (for CI) when any link was bad.
main
exit(1) if $check_links_failed
puts "All links seem to be good."