-
Notifications
You must be signed in to change notification settings - Fork 0
/
mysql_cages_table_loader.py
53 lines (44 loc) · 2.19 KB
/
mysql_cages_table_loader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
#!/usr/bin/env python
import pandas as pd
from sqlalchemy import create_engine
#from sqlalchemy.dialects.mysql import LONGTEXT
import sys
try:
input_file = sys.argv[1]
except:
sys.exit('<input_tsv_gz_file>')
df = pd.read_csv(input_file, compression="gzip", sep="\t", low_memory=False)
#df.rename(columns = {'Gene_name/Gene_ID':'Gene_name_Gene_ID'}, inplace = True)
#df = df.sort_values(by=['Gene_name_Gene_ID'])
engine = create_engine("mysql+pymysql://{user}:{pw}@127.0.0.1:{port}/?charset=utf8".format(user="root",
pw="", port=3307))
with engine.connect() as connection:
avail_db = connection.execute('SHOW DATABASES').fetchall()
avail_db = map(lambda x: str(x[0]), avail_db)
if not 'Cages_tables' in avail_db:
print 'Cages_tables db not available, I\'m gonna creating it..'
connection.execute('CREATE DATABASE Cages_tables')
count_row = df.shape[0]
print 'Your df {} contains {} rows'.format(input_file, count_row)
print 'Going to upload its content...'
engine = create_engine("mysql+pymysql://{user}:{pw}@127.0.0.1:{port}/{db}?charset=utf8".format(user="root",
pw="", port=3307, db="Cages_tables"))
#dtype = {
# "Editing_events": LONGTEXT
#}
sql_table = input_file.replace('.cages.gz','').replace('.','_').replace('-','_') + '_cages'
df.to_sql(sql_table, con=engine, if_exists='replace', index=False, chunksize=1000)#, dtype=dtype)
sql_count = 'SELECT COUNT(*) FROM {}'.format(sql_table).rstrip(',')
print 'Your uploaded utrs_table contains {} rows'.format(list(engine.execute(sql_count))[0][0])
print '{} Succesfully UPLOADED!'.format(sql_table)
print 'Compressing {}'.format(sql_table)
cmd = "ALTER TABLE `{}` ROW_FORMAT=COMPRESSED;".format(sql_table)
engine.execute(cmd)
print 'Compression EXECUTED, ALL DONE for {}!'.format(sql_table)
print 'Indexing {}'.format(sql_table)
#cmd_index = "CREATE UNIQUE INDEX transcript_id ON {} (Transcript_id(255))".format(sql_table)
cmd_index = "CREATE INDEX transcript_id ON {} (Transcript_id(255))".format(sql_table)
engine.execute(cmd_index)
print 'Indexing EXECUTED, ALL DONE for {}!'.format(sql_table)
engine.dispose()
print '-------------------------------------------------------------------------------------'