-
Notifications
You must be signed in to change notification settings - Fork 13
/
load_scripts_from_github.sql
620 lines (498 loc) · 20.5 KB
/
load_scripts_from_github.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
-- see second script contained in this file for actual functionality
CREATE SCHEMA IF NOT EXISTS EXA_TOOLBOX;
-- ****************************************
--
-- Get SQL scripts from Github
--
-- Loads scripts from public Github repos and executes the contained files so that the
-- scripts are available in the database. Only loads files ending with '.sql'
-- Needs an internet connection to work.
--
-- Parameters:
-- repository: URL of the Github repository to get, e.g. 'https://EXA_TOOLBOX.com/EXASOL/database-migration'
-- search_subfolders: if true, search also the folders contained in the repository recursively
-- exclude_folders: String containing folders that should be excluded from search, specify multiple folders
-- by separating them with a comma, e.g. 'test,secret_folder'
--
-- Returns table containing:
-- repository, e.g. 'EXASOL/database-migration/master'
-- filename, e.g. 'db2_to_exasol.sql'
-- link, e.g. 'https://raw.githubusercontent.com/EXASOL/database-migration/master/db2_to_exasol.sql'
-- content, e.g. 'create schema data...'
-- ****************************************
--/
CREATE OR REPLACE PYTHON SCALAR SCRIPT EXA_TOOLBOX.GITHUB_LOAD_SCRIPTS_GET_CONTENT("repo" varchar(4096), "search_subfolders" boolean, "exclude_folders" varchar(4096))
EMITS ("repository" varchar(2000) UTF8, "filename" varchar(2000) UTF8, "link" varchar(2000) UTF8, "content" varchar(2000000) UTF8)
AS
import urllib2
import socket
from HTMLParser import HTMLParser
#################### Class Git Link Parser ##############################
class File_Item:
repository = ''
filename = ''
link = ''
#################### Class Git Link Parser ##############################
# This class can be used to get all the links that are in the table of a github page
class GitLinkParser(HTMLParser):
def __init__(self):
HTMLParser.__init__(self)
self.in_filelist = False
self.file_list = []
self.folder_list = []
#---------------------------------------------------------------#
def feed(self, data):
self.in_filelist = False
self.file_list = []
self.folder_list = []
HTMLParser.feed(self,data)
#---------------------------------------------------------------#
# check for element which contains all the files on github page
# this check is used to make sure not to include objects mentioned in the readme.md below filelist
def check_for_file_container(self, tag, attrs):
if tag == 'table':
for attr_name, attr_value_raw in attrs:
attr_value = str(attr_value_raw)
if attr_name =='class' and attr_value.find('files') != -1 and attr_value.find('js-navigation-container') != -1:
self.in_filelist = True
#---------------------------------------------------------------#
# should be used for tags of type <a>
# adds the title and link to raw code to file_list
def add_link_to_list(self, attrs):
title = ''
link = ''
ignoreFile = False
isSqlFile = False
isFolder = False
for attr_name, attr_value_raw in attrs:
attr_value = str(attr_value_raw)
if attr_name == 'href' and attr_value.endswith('.sql'):
isSqlFile = True
link = 'https://raw.githubusercontent.com' + attr_value.replace('blob/', '')
title = attr_value.replace('blob/', '')
if attr_name == 'href' and attr_value.find('.') == -1 and attr_value.find('tree') != -1:
isFolder = True
link = 'https://github.com' + attr_value
if attr_name == 'title' and attr_value.find('Go to parent directory') != -1:
ignoreFile = True
## after for-loop, decide whether or not to add link
if isSqlFile:
slash_pos = title.rfind('/')
item = File_Item()
item.repository = title[1:slash_pos]
item.filename = title[slash_pos+1:len(title)]
item.link = link
self.file_list.append(item)
if isFolder and (not ignoreFile):
self.folder_list.append(link)
#---------------------------------------------------------------#
def handle_starttag(self, tag, attrs):
# sets self.in_filelist to true if in container
self.check_for_file_container(tag, attrs)
# if element is a link and we are in the file-list: check for file extension
if tag == 'a' and self.in_filelist:
self.add_link_to_list(attrs)
#---------------------------------------------------------------#
# basic implementation. As all links are included in the same div, if a div ends, we do not want to read any more files until the next one starts
def handle_endtag(self, tag):
if tag == 'table':
self.in_filelist = False
################ End Class Git Link Parser ##############################
################ Start actual script ###################################
def get_folders_and_file_links_helper(repo, ctx):
data = urllib2.urlopen(repo)
contents = data.read()
parser = GitLinkParser()
parser.feed(contents)
return parser.folder_list, parser.file_list;
#---------------------------------------------------------------#
# checks whether path ends with one of the elements in input_list
# if input_list is empty, returns false
def path_ends_with(path, input_list):
if len(input_list) == 0 or len(input_list[0]) == 0:
return False
for foldername in input_list:
if path.endswith('/'+ foldername):
return True
#---------------------------------------------------------------#
def get_files(repo, search_recursive, exclude_folders, ctx):
folders_to_search = []
searched_folders = []
files = []
folders_to_search.append(repo)
while (len(folders_to_search) > 0):
curr_folder = folders_to_search.pop()
# do not add the same string twice
if curr_folder in searched_folders:
continue
# continue if curr_folder should be excluded
if path_ends_with(curr_folder, exclude_folders):
continue
searched_folders.append(curr_folder)
# ctx.emit('DEBUG: Current folder', curr_folder, '', '')
new_folders, new_files = get_folders_and_file_links_helper(curr_folder, ctx)
if(search_recursive):
folders_to_search.extend(new_folders)
files.extend(new_files)
return files
#---------------------------------------------------------------#
import ctypes
import os
def run(ctx):
exclude_folders = str(ctx.exclude_folders).split(',')
try:
file_list = get_files(ctx.repo, ctx.search_subfolders,exclude_folders, ctx)
except urllib2.URLError as e:
raise Exception('Could not establish a connection to Github. Please make sure your database has a connection to the internet.')
for item in file_list:
try:
data = urllib2.urlopen(item.link)
content = data.read()
ctx.emit(item.repository, item.filename, item.link, content)
except Exception as e:
ctx.emit('ERROR on '+ item.link,str(e),'', '')
/
--SELECT EXA_TOOLBOX.GITHUB_LOAD_SCRIPTS_GET_CONTENT('https://github.com/EXASOL/database-migration', true, 'test') res FROM dual;
-- SELECT EXA_TOOLBOX.GITHUB_LOAD_SCRIPTS_GET_CONTENT('https://github.com/EXASOL/virtual-schemas', true) res FROM dual;
-- ****************************************
--
-- Load scripts from Github
--
-- Loads scripts from public Github repos and executes the contained files so that the
-- scripts are available in the database. Needs an internet connection to work.
--
-- Parameters:
-- repository: URL of the Github repository to search, e.g. 'https://EXA_TOOLBOX.com/EXASOL/database-migration'
-- file_filter: only files that match the file filter will be loaded into the database. if left blank, all files are loaded. e.g. '' or 'to_exasol'
-- files_to_exclude: Comma seperated list of files you don't want to execute even though they match the file_filter, e.g. 'Example.sql,Dummy.sql'
-- search_recursive: if true, search also the folders contained in the repository recursively
-- exclude_folders: String containing folders that should be excluded from search, specify multiple folders
-- by separating them with a comma, e.g. 'test,secret_folder'
--
-- Returns:
-- Table containing information on which files were loaded
--
-- ****************************************
--/
CREATE OR REPLACE LUA SCRIPT EXA_TOOLBOX.GITHUB_LOAD_SCRIPTS(repository, file_filter, files_to_exclude, search_recursive, exclude_folders) RETURNS TABLE AS
-- checks whether the content of tokenlist starting at strt is a script
-- returns: String - statementType, String - scriptType, String -scriptName, String -scriptComment
function getStmtType(tokenlist, strt)
stmtType = 'STATEMENT'
scriptType = 'LUA' -- LUA is the default, other possible values are: JAVA, PYTHON, R
scriptComment = '' -- Collect comments before 'CREATE'-keyword in this String
-- search for first token that is no comment or whitespace
while( sqlparsing.iswhitespaceorcomment(tokenlist[strt]) and strt < #tokenlist) do
scriptComment = scriptComment .. tokenlist[strt]
strt = strt + 1
end
if(sqlparsing.normalize(tokenlist[strt]) ~= "CREATE") then
return stmtType, 'NONE' , '', ''
end
-- after a create, the next 7 tokens must contain 'script' in order to be a proper script
-- Maximum number of tokens between CREATE and SCRIPT: 7
-- CREATE OR REPLACE SCALAR LUA SCRIPT
nr_search_to = math.min(#tokenlist, strt + 7)
output('search from '..strt..' until token '..nr_search_to)
-- use while-loop instead of for-loop here because nr_search_to gets changed in loop
i = strt + 1;
while( i <= nr_search_to) do
-- determine script type. must be explicitly given if other than lua
if (tokenlist[i] == 'PYTHON') or (tokenlist[i] == 'JAVA') or (tokenlist[i] == 'R') then
scriptType = tokenlist[i]
end
-- if there's a comment token, enlarge the search space by one
if(sqlparsing.iswhitespaceorcomment(tokenlist[i]) ) then
if (nr_search_to < #tokenlist) then
output('Whitespace or comment in script search space --> skip token and enlarge search space by one')
nr_search_to = nr_search_to +1
end
else
if(string.find(tokenlist[i], ';')) then
output('found semicolon before "SCRIPT"-keyword --> no script')
return stmtType, 'NONE', '', ''
end
output('search for "SCRIPT" or "FUNCTION" in '.. sqlparsing.normalize(tokenlist[i]))
tok = sqlparsing.normalize(tokenlist[i])
if (tok == "SCRIPT" or tok == "FUNCTION") then
stmtType = tok
-- find token containing script name
name_search_pos = i + 1
while( (not sqlparsing.isidentifier(tokenlist[name_search_pos])) and name_search_pos < #tokenlist) do
name_search_pos = name_search_pos + 1
end
scriptName = tokenlist[name_search_pos]
return stmtType, scriptType, scriptName, scriptComment
end
end
i = i +1
end -- end while-loop
return stmtType, 'NONE', '', ''
end
---------------------------------------------------------------------------------
-- get rid of whitespaces and tabs
-- then check if remaining token start with a newline
function starts_with_newline(token)
t = string.gsub(token, ' ', '')
t = string.gsub(t, ' ', '')
t = string.gsub(t, '\n+', '\n')
nl = '\n'
return t:sub(1, #nl) == nl
end
-- check if last token is a newline
function ends_with_newline(token)
nl = '\n'
return token:sub(-#nl) == nl
end
---------------------------------------------------------------------------------
-- returns token number of '/' if tokenstring contains newline followed by '/'
function findScriptEnd(tokenlist, startToken)
for i = startToken + 1, #tokenlist do
if (ends_with_newline(tokenlist[i-1]) and tokenlist[i] == '/') or tokenlist[i] == '\n/\n' then
-- check if the slash is really the end of the script. this is the case if the slash is either
-- the last token or there are only spaces, tabs and one newline in the next one
if(i+1 > #tokenlist or (i+1 <= #tokenlist and starts_with_newline(tokenlist[i+1]))) then
return i
end
end
end
return nil
end
---------------------------------------------------------------------------------
-- split by delimiter into array and include delimiter also in the array
function split(str, delim)
-- Eliminate bad cases...
if string.find(str, delim) == nil then
return { str }
end
local result = {}
local pat = "(.-)" .. delim .. "()"
local nb = 0
local lastPos
for part, pos in string.gfind(str, pat) do
nb = nb + 1
result[nb] = part
nb = nb + 1
result[nb] = delim
lastPos = pos
end
-- Handle the last field
rest = string.sub(str, lastPos)
if( rest ~= nil and #rest ~= 0) then
result[nb + 1] = rest
end
return result
end
---------------------------------------------------------------------------------
-- create a new tokenlist that is also splitted by CRLF / CRLF
function splitBySlash(tokenlist)
-- create a string that contains a CRLF
scriptEndToken = '\n/\n'
new_tokenlist = {}
for i = 1, #tokenlist do
if string.find(tokenlist[i],scriptEndToken) then
splitted = split(tokenlist[i], scriptEndToken)
for j = 1, #splitted do
table.insert(new_tokenlist, splitted[j])
end
else
table.insert(new_tokenlist, tokenlist[i])
end
end
return new_tokenlist
end
---------------------------------------------------------------------------------
-- returns a table containing an entry for each statement of script file, each row consists of:
-- stmt, stmtType, scriptName, scriptComment
function getStatements(script_file)
statements = {}
tokenlist = sqlparsing.tokenize(script_file)
tokenlist = splitBySlash(tokenlist)
startTokenNr = 1
searchForward = true
searchSameLevel = false
ignoreFunction = sqlparsing.iswhitespaceorcomment
stmtEnd = ';'
skriptStart= 'CREATE'
-- Debugging --------
-- output('---- TOKENLIST START ----')
-- for i = 1, #tokenlist do
-- output(i..' '..tokenlist[i])
-- end
-- output('---- TOKENLIST END ----')
-- Debugging --------
while startTokenNr <= #tokenlist do
-- check if the next statement is a script
stmtType, scriptType, scriptName, scriptComment = getStmtType(tokenlist, startTokenNr)
if (isScriptOrFunction(stmtType)) then
output('---> is script. Search for / starting at '.. startTokenNr)
-- check whether token before / is a newline, if not, it's no proper script end
endTokenNr = findScriptEnd(tokenlist, startTokenNr)
if endTokenNr ~= nil then
output('End token nr is '..endTokenNr..' text: '..tokenlist[endTokenNr])
end
else
output('---> is NO script. Search for '..stmtEnd..' starting at '.. startTokenNr)
endTokenNr = sqlparsing.find(tokenlist, startTokenNr, searchForward, searchSameLevel, ignoreFunction, stmtEnd)
if endTokenNr ~= nil then
endTokenNr = endTokenNr[1]
output('End token nr is '..endTokenNr..' text: '..tokenlist[endTokenNr])
end
end
if endTokenNr == nil then
output('No endtoken found, setting to #tokenlist: '.. #tokenlist)
endTokenNr = #tokenlist
end
stmt = {unpack(tokenlist, startTokenNr, endTokenNr)}
stmt = table.concat(stmt, "")
table.insert(statements, {stmt, stmtType, scriptName, scriptComment})
startTokenNr = endTokenNr + 1
end
return statements
end
-- check whether input text is SCRIPT or FUNCTION
function isScriptOrFunction(stmt_type)
if(stmt_type == 'SCRIPT' or stmt_type == 'FUNCTION') then
return true
else
return false
end
end
-- executes all statements contained in script_statements ahead of first script
-- and adds statements contained in script_statements after script as a comment to the script
function execute_scripts_with_comments(filename, script_statements)
info = {}
execute_this = true
script_name = ''
----------------
-- script_statements[j][2] contains: stmt_is_script
-- all statemtns before first script: put execute flag
j = 1
while (j <= #script_statements and (isScriptOrFunction(script_statements[j][2]) == false)) do
stmt = script_statements[j][1]
stmt_type = script_statements[j][2]
stmt_name = script_statements[j][3]
execute_this = true
add_as_comment = ''
executed = ''
table.insert(info, {filename, stmt_name, stmt, stmt_type, execute_this, add_as_comment, executed})
j = j +1
end
-- this part is executed after for all the scripts
while (j <= #script_statements) do
stmt = script_statements[j][1]
stmt_type = script_statements[j][2]
stmt_name = script_statements[j][3]
add_as_comment = script_statements[j][4]
execute_this = false
executed = ''
if ( isScriptOrFunction(stmt_type)) then
execute_this = true
-- collect all the comments, this are all statements where is_script is false
c = j + 1
while (c <= #script_statements and (not script_statements[c][2])) do
comment_stmt = script_statements[c][1]
add_as_comment = add_as_comment .. comment_stmt
c = c + 1
end
end
table.insert(info, {filename, stmt_name, stmt, stmt_type, execute_this, add_as_comment, executed})
j = j +1
end
-- for-loop to actually execute the statements collected in info-table
for j = 1, #info do
stmt_name = info[j][2]
stmt = info[j][3]
stmt_type = info[j][4]
execute_this = info[j][5]
add_as_comment = info[j][6]
executed_this_stmt = ''
if ( execute_this) then
stmt_suc, stmt_res = pquery(stmt)
if stmt_suc then
executed_this_stmt = 'YES'
else
executed_this_stmt = 'FAILED: '.. stmt_res.error_message
end
end
if (add_as_comment ~= '') then
stmt_suc, stmt_res = pquery([[comment on ]]..stmt_type..[[ ::s is :c]], {s=stmt_name, c=add_as_comment})
if stmt_suc then
executed_this_stmt = executed_this_stmt ..' , and commented'
else
executed_this_stmt = executed_this_stmt .. ', failed to comment: '.. stmt_res.error_message
end
end
info[j][7] = executed_this_stmt
end -- end for-loop
return info
end
function string.startswith(String,Start)
return string.sub(String,1,string.len(Start))==Start
end
-- append rows of table t2 to t1
function appendTable(t1,t2)
for i=1,#t2 do
t1[#t1+1] = t2[i]
end
return t1
end
-- check whether filename is contained in name_list
function file_in_list(filename,name_list)
for i=1, #name_list do
if filename == name_list[i] then
return true
end
end
return false
end
---------------------------------------------------------------------------------
------------------------- actual script -----------------------------------------
info_out = {}
info_detailed = {}
execute_all_statements = false
execute__stmts_before_script = true
if(files_to_exclude == null) then
files_to_exclude = {}
else
files_to_exclude = split(files_to_exclude, ',')
end
suc, res = pquery([[SELECT EXA_TOOLBOX.GITHUB_LOAD_SCRIPTS_GET_CONTENT(:r, :rec, :exc) res FROM dual]], {r=repository, rec=search_recursive, exc=exclude_folders})
--suc, res = pquery([[SELECT 'repo', filename, link, content from test.gitFiles where id = 4]])
if not suc then
error(res.error_message)
end
if #res == 0 then
error('Could not retreive files from github, maybe the url or file_filter is wrong?')
end
for i = 1,#res do
repository = res[i][1]
filename = res[i][2]
link = res[i][3]
content = res[i][4]
-- check for errors: if an error occured in the python script, repository will start with 'ERROR'
if string.startswith(repository, 'ERROR') then
exit('Problem retrieving files: ' .. repository)
end
-- check whether file_filter machtes and file is not ignored
if (file_filter == null or string.find(filename, file_filter)) and (files_to_exclude == null or (not file_in_list(filename,files_to_exclude))) then
script_statements = getStatements(content)
new_info = execute_scripts_with_comments(filename, script_statements)
info_detailed = appendTable(info_detailed, new_info)
--table.insert(info_out,{repository, filename, 'Executed'} )
end -- end if
end -- end outer for-loop
-- depending on detail level, use exit(info_out, ...) or exit(info_detailed, ... )
--exit(info_out, "Repo varchar(2000000), filename varchar (2000000), executed varchar(20000)")
exit(info_detailed, "file_name varchar(2000000), stmt_name varchar(2000000), stmt varchar(2000000), stmt_type varchar(10), should_be_executed boolean, comment_on_this varchar(20000), executed varchar(20000)")
/
--repository, file_filter, search_recursive
EXECUTE SCRIPT EXA_TOOLBOX.GITHUB_LOAD_SCRIPTS(
'https://github.com/EXASOL/exa-toolbox' -- repository
, '' -- file_filter
, 'Example.sql,load_scripts_from_github.sql' -- files_to_exclude
, true -- search_recursive
, 'test' -- exclude this folder from search
)
--with output
;