forked from IBM/guestbook
-
Notifications
You must be signed in to change notification settings - Fork 2
/
.verify-links.sh
executable file
·274 lines (232 loc) · 7.98 KB
/
.verify-links.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
#!/bin/bash
# Copyright 2017 The Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This script will scan all md (markdown) files for bad references.
# It will look for strings of the form [...](...) and make sure that
# the (...) points to either a valid file in the source tree or, in the
# case of it being an http url, it'll make sure we don't get a 404.
#
# Usage: verify-links.sh [ dir | file ... ]
# default arg is root of our source tree
# Strict mode: abort on any unhandled error, on use of an unset variable,
# and when any stage of a pipeline fails.
set -o errexit
set -o nounset
set -o pipefail

# Settings controlled by the command line flags.
verbose=""       # non-empty: print each file as it is checked (-v)
debugFlag=""     # non-empty: print each href as it is found (-d)
maxRetries="1"   # how many times to GET an http(s) url before giving up (-t)
stop=""          # set to "1" once "--" has been seen

# All scratch files are named "${tmp}<suffix>". They live in a private
# directory created by mktemp so that:
#  - the names can't be predicted (or pre-created) by another user, and
#  - the "${tmp}*" glob used for cleanup can never match the scratch files
#    of another run of this script.
# The directory is created directly under /tmp (no spaces in its name)
# because the prefix is used unquoted in other parts of this script.
tmpDir=$(mktemp -d /tmp/verify-links.XXXXXX)
tmp="${tmpDir}/out"

# Everything we create lives under tmpDir so removing it on exit cleans up
# all of our scratch files no matter how the script ends.
trap 'rm -rf -- "${tmpDir}"' EXIT

seenFiles=( ":" ) # just to prevent "undefined" errors
# findPreviousFile will search "seenFiles" for a file to see if we've seen
# it before.
# If we have then "foundAnchor" is set to the matching "anchorFile" and we
# return 0. If we haven't seen it then we add it to "seenFiles", set
# "foundAnchor" to the name of a new (not yet created) "anchorFile" and
# return 1 so the caller knows it needs to fill it in.
# $1 == search file
# Each "seenFiles" entry has the form "<file>:<anchorFile>".
# Note we can't use a map because bash on a mac doesn't support it.
foundAnchor=""
function findPreviousFile() {
  local entry
  for entry in "${seenFiles[@]}" ; do
    # Split on the LAST ":" so that a file whose path contains a ":" can
    # still be found. This assumes the anchorFile name (built from ${tmp}
    # below) never contains a ":".
    if [[ "${entry%:*}" == "$1" ]]; then
      foundAnchor="${entry##*:}"
      return 0
    fi
  done

  # Didn't find it so create a new anchorFile name and save it for next time
  foundAnchor="${tmp}-anchors-${RANDOM}-${RANDOM}"
  seenFiles+=("$1:${foundAnchor}")
  return 1
}
# debug prints its arguments (joined by spaces) to stderr, but only when
# the -d flag was given (i.e. "debugFlag" is not empty).
# The text is printed exactly as passed in: it's quoted so that the shell
# doesn't expand glob characters (e.g. an href of "*") or squash spaces.
function debug {
  if [[ "${debugFlag}" != "" ]]; then
    printf '%s\n' "$*" >&2
  fi
}
# clean removes every scratch file whose name starts with "${tmp}".
# "${tmp:?}" makes the shell abort if tmp is ever empty/unset - without it
# this would turn into "rm -f *" in the current directory. The prefix is
# quoted (only the trailing * is a glob) so odd characters in it are safe.
function clean {
  rm -f -- "${tmp:?}"*
}
# showUsage prints the help text to stdout.
function showUsage() {
  echo "Usage: $0 [OPTION]... [DIR|FILE]..."
  echo "Verify all links in markdown files."
  echo
  echo " -v show each file as it is checked"
  echo " -d show each href as it is found"
  echo " -t retry GETs to http(s) URLs 5 times"
  echo " -? show this help text"
  echo " -- treat remainder of args as dir/files"
}

# parseOptions processes the leading "-xyz" style args in "$@".
# Flags can be combined (e.g. "-vt"). Parsing stops at the first arg that
# doesn't start with "-", or right after "--".
# Sets:    verbose, debugFlag, maxRetries, stop
#          optArgCount - number of leading args that were consumed
# Exits:   0 after showing the help text (-? or -h)
#          1 on an unknown flag (message goes to stderr)
function parseOptions() {
  local opts
  optArgCount=0
  while [[ "$#" != "0" && "$1" == "-"* ]]; do
    opts="${1:1}"
    while [[ "$opts" != "" ]]; do
      case "${opts:0:1}" in
        v) verbose="1" ;;
        d) debugFlag="1" ; verbose="1" ;;
        t) maxRetries="5" ;;
        -) stop="1" ;;
        # The "?" must be escaped - a bare ? is a glob that matches ANY
        # single char, which would turn every typo into "show help, exit 0"
        # and make the "Unknown option" case below unreachable.
        # "h" is accepted too since "-h" used to show the help text.
        \?|h) showUsage
           exit 0 ;;
        *) echo "Unknown option '${opts:0:1}'" >&2
           exit 1 ;;
      esac
      opts="${opts:1}"
    done
    shift
    optArgCount=$((optArgCount + 1))
    if [[ "${stop:-}" == "1" ]]; then
      break
    fi
  done
}

parseOptions "$@"
# Drop the flags so that only the dirs/files are left in "$@"
shift "${optArgCount}"
# echo verbose:$verbose
# echo debugFlag:$debugFlag
# echo args:$*
# findMarkdownFiles prints, sorted and one per line, every "*.md" file found
# under the dirs/files passed in, skipping any path that contains "vendor"
# or "glide".
# The greps are allowed to match nothing: "no markdown files" is a valid
# (empty) result, not an error. A failing "find" (e.g. a path that doesn't
# exist) still fails the pipeline when pipefail is on.
function findMarkdownFiles() {
  find "$@" -name "*.md" | \
    sort | \
    { grep -v vendor || true ; } | \
    { grep -v glide || true ; }
}

# Default to the current dir when no dirs/files were given.
# "$@" is passed on quoted so that args with spaces or glob chars in them
# reach "find" untouched.
arg=""
if [[ "$*" == "" ]]; then
  arg="."
  mdFiles=$(findMarkdownFiles "${arg}")
else
  mdFiles=$(findMarkdownFiles "$@")
fi
# trimSpaces prints $1 with its leading and trailing whitespace removed.
# It only uses parameter expansion so the text is never word-split or
# glob-expanded by the shell (an href of "*" stays "*").
function trimSpaces() {
  local text="$1"
  text="${text#"${text%%[![:space:]]*}"}"
  text="${text%"${text##*[![:space:]]}"}"
  printf '%s' "${text}"
}

# listAnchors prints, one per line, every anchor that a "#xxx" link into
# the markdown file $1 is allowed to point at:
#  - one per section header, converted the way the rendered page does it:
#    remove leading & trailing spaces, lower case it, convert each run of
#    spaces to a single "-", drop all other non alphanumeric chars, and
#    add "-1", "-2", ... to anchors that have already been used.
#  - the name from each <a name="xxx"> tag.
function listAnchors() {
  local mdFile="$1"
  local used=""   # anchors handed out so far, each one wrapped in spaces
  local section anchor num

  # Note: the spaces->dash sed must match ONE or more spaces. A pattern
  # that can match zero spaces puts a "-" between every single character.
  grep "^[[:space:]]*#" < "${mdFile}" | \
    sed 's/[[:space:]]*##*[[:space:]]*//' | \
    sed 's/[[:space:]]*$//' | \
    tr '[:upper:]' '[:lower:]' | \
    sed 's/ \{1,\}/-/g' | \
    sed "s/[^-a-zA-Z0-9]//g" | while read -r section ; do
      # If we haven't used this exact anchor before just use it now
      if [[ "${used}" != *" ${section} "* ]]; then
        anchor="${section}"
      else
        # We've used this anchor before so add "-#" to the end.
        # Keep adding 1 to "#" until we find a free spot.
        num=1
        while true; do
          anchor="${section}-${num}"
          if [[ "${used}" != *" ${anchor} "* ]]; then
            break
          fi
          num=$((num + 1))
        done
      fi
      echo "${anchor}"
      used="${used} ${anchor} "
      debug "Mapped section '${section}' to '${anchor}'"
    done || true    # a file without any headers is fine

  # Add sections of the form <a name="xxx">. "grep -o" puts each tag on
  # its own line so several tags on one line are all picked up.
  grep -o '<a name="[^"]*"' < "${mdFile}" | \
    sed 's/^<a name="//' | \
    sed 's/"$//' | \
    sort | uniq || true    # a file without any <a name=> tags is fine
}

# checkFile verifies every [...](...) href in the markdown file $1.
#  - http* hrefs are fetched with curl (up to "maxRetries" attempts)
#  - mailto: hrefs are skipped
#  - hrefs with a "#" must point to an existing file (or this file) AND to
#    an existing section/anchor in it, unless they look like "#L<num>"
#  - anything else must be an existing path relative to the file's dir
# Each problem is printed to stdout and appended to "${tmp}3".
# Uses the "${tmp}1" and "${tmp}2" scratch files.
function checkFile() {
  local file="$1"
  local dir line ref try extra fullpath anchorFile newPath

  dir=$(dirname "${file}")
  if [[ -n "${verbose}" ]]; then
    echo "> ${file}"
  fi

  # Replace ) with )\n so that each possible href is on its own line.
  # Then only grab lines that have [..](..) in them - put results in tmp file.
  # If the file doesn't have any lines with [..](..) then skip this file
  # Steps:
  #  tr   - convert all \n to a space since newlines shouldn't change anything
  #  sed  - add a \n after each ) since ) ends what we're looking for.
  #         This makes it so that each href is on a line by itself
  #  sed  - prefix each line with a space so the grep can do [^\\]
  #  grep - find all lines that match [...](...)
  tr '\n' ' ' < "${file}" | \
    sed "s/)/)\n/g" | \
    sed "s/^/ /g" | \
    grep "[^\\]\[.*\](.*)" > "${tmp}1" || return 0

  # This sed will extract the href portion of the [..](..) - meaning
  # the stuff in the parens.
  sed "s/.*\[*\]\([^()]*\)/\1/" < "${tmp}1" > "${tmp}2" || return 0

  # The loop reads from a redirect, not a pipe, so that it runs in this
  # shell and the "seenFiles" cache updated by findPreviousFile survives
  # from one href (and one file) to the next.
  # "read" is used without -r on purpose: it drops the "\" from markdown
  # escapes such as "\_" so the href matches the real file name.
  while read line ; do
    # Strip off the leading and trailing parens, and then spaces
    ref="${line#*(}"
    ref="${ref%)*}"
    ref=$(trimSpaces "${ref}")

    # Show all hrefs - mainly for verifying in our tests
    debug "Checking: '${ref}'"

    # An external href (ie. starts with http)
    if [[ "${ref:0:4}" == "http" ]]; then
      try=0
      while true ; do
        # stdin is pointed at /dev/null so curl can never eat the list of
        # hrefs that this loop is reading.
        if curl -f -s -k --connect-timeout 10 "${ref}" \
            > /dev/null 2>&1 < /dev/null ; then
          break
        fi
        try=$((try + 1))
        if (( try >= maxRetries )); then
          extra=""
          if (( try > 1 )); then
            extra=" (tried ${try} times)"
          fi
          echo "${file}: Can't load url: ${ref}${extra}" | tee -a "${tmp}3"
          break
        fi
        sleep 1
      done
      continue
    fi

    # Skip "mailto:" refs
    if [[ "${ref:0:7}" == "mailto:" ]]; then
      continue
    fi

    # Local file link (i.e. ref contains a #)
    if [[ "${ref}" == *"#"* ]]; then
      if [[ "${ref:0:1}" != "#" ]]; then
        # Split ref into filepath (before the first #) and the section link
        fullpath="${dir}/${ref%%#*}"
        ref="${ref#*#}"
      else
        # Just "#xxx" - it points into the file we're checking
        fullpath="${file}"
        ref="${ref:1}"
      fi

      if [[ ! -e "${fullpath}" ]]; then
        echo "${file}: Can't find referenced file '${fullpath}'" | \
          tee -a "${tmp}3"
        continue
      fi

      # Remove leading and trailing spaces
      ref=$(trimSpaces "${ref}")

      # Skip refs of the form #L<num> and assume its pointing to a line
      # number of a file and those don't have anchors. Checked before the
      # anchors are collected so we don't scan the file for nothing.
      if [[ "${ref}" =~ ^L([0-9])+$ ]]; then
        continue
      fi

      # If we've seen this file before then reuse its anchor file,
      # otherwise fill in the new one we were just given.
      if ! findPreviousFile "${fullpath}" ; then
        listAnchors "${fullpath}" > "${foundAnchor}"
      fi
      anchorFile="${foundAnchor}"

      # Finally, look for the ref in the list of sections/anchors.
      # -F -x: the ref must equal a whole line, it is NOT used as a regex.
      debug "Anchor file(${fullpath}): ${anchorFile}"
      if ! grep -F -x -q -- "${ref}" "${anchorFile}" 2> /dev/null ; then
        echo "${file}: Can't find section '#${ref}' in ${fullpath}" | \
          tee -a "${tmp}3"
      fi
      continue
    fi

    # A plain relative path - make sure the file is there
    newPath="${dir}/${ref}"
    if [[ ! -e "${newPath}" ]]; then
      echo "${file}: Can't find: ${newPath}" | tee -a "${tmp}3"
    fi
  done < "${tmp}2"
}

# Start with no scratch files, then check each markdown file in turn.
# "mdFiles" holds one file name per line. It's read on fd 3 so that nothing
# run inside the loop can swallow the list by reading stdin.
clean
while IFS= read -r -u 3 file ; do
  if [[ -n "${file}" ]]; then
    checkFile "${file}"
  fi
done 3<<< "${mdFiles}"

# "${tmp}3" collects every error that was reported - fail if there were any
if [[ -s "${tmp}3" ]]; then exit 1 ; fi