Skip to content

Commit

Permalink
added lab4 and lab6
Browse files Browse the repository at this point in the history
  • Loading branch information
nimay-gupta committed Mar 25, 2021
1 parent 79aaeef commit 1cf462c
Show file tree
Hide file tree
Showing 22 changed files with 1,011 additions and 1 deletion.
30 changes: 30 additions & 0 deletions Lab4-SQL/1_b.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import csv

table = "COVID"
csv_files = ["data1.csv", "data2.csv", "data3.csv", "data4.csv", "data5.csv"]

# CSV cell values that are written as SQL NULL instead of a quoted string.
NULL_MARKERS = ("NULL", "")


def sql_literal(value):
    """Return one CSV cell rendered as a SQL literal.

    Cells listed in ``NULL_MARKERS`` become the keyword ``NULL``; every other
    cell becomes a single-quoted string. Embedded single quotes are doubled so
    that a value such as ``O'Brien`` does not end the literal early and
    corrupt the generated statement.
    """
    if value in NULL_MARKERS:
        return "NULL"
    return "'" + value.replace("'", "''") + "'"


def insert_statement(table_name, row):
    """Return the ``insert`` statement (newline-terminated) for one CSV row."""
    values = ", ".join(sql_literal(cell) for cell in row)
    return "insert into " + table_name + " values (" + values + ");\n"


def csv_to_sql(csv_filename, sql_filename, table_name=table):
    """Translate ``csv_filename`` into a SQL script at ``sql_filename``.

    The script is wrapped in ``begin;`` / ``end;``, first empties
    ``table_name`` and then inserts one row per CSV record. The first CSV
    record is treated as a header and skipped.
    """
    # Both files are managed by ``with`` so they are closed even when a
    # malformed CSV raises part-way through.
    with open(csv_filename, "r", newline="") as csvfile, \
            open(sql_filename, "w") as sql_file:
        csvreader = csv.reader(csvfile)
        next(csvreader, None)  # skip the header; tolerate an empty file

        sql_file.write("begin;\n")
        sql_file.write("delete from " + table_name + ";\n")
        for row in csvreader:
            if not row:
                # csv.reader yields [] for blank lines; "values ()" is not
                # valid SQL, so such lines are skipped.
                continue
            sql_file.write(insert_statement(table_name, row))
        sql_file.write("end;\n")


def main():
    """Generate ``dataN.sql`` next to every ``dataN.csv`` in ``csv_files``."""
    for csv_filename in csv_files:
        sql_filename = csv_filename.split(".")[0] + ".sql"
        csv_to_sql(csv_filename, sql_filename)


if __name__ == "__main__":
    main()
24 changes: 24 additions & 0 deletions Lab4-SQL/1_c.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import psycopg2, config, time

table = "COVID"
csv_files = ["data1.csv", "data2.csv", "data3.csv", "data4.csv", "data5.csv"]


def connect():
    """Open a new database connection from the values in ``config``."""
    return psycopg2.connect(
        database=config.name,
        user=config.user,
        password=config.pswd,
        host=config.host,
        port=config.port,
    )


def clear_table():
    """Delete every row of ``table`` and commit, on its own connection."""
    conn = connect()
    try:
        cur = conn.cursor()
        cur.execute("delete from " + table + ";")
        cur.close()
        conn.commit()
    finally:
        # Close the connection even if the delete fails.
        conn.close()


def bulk_load(csv_filename):
    """Load ``csv_filename`` into ``table`` with a single ``copy`` command."""
    conn = connect()
    try:
        cur = conn.cursor()
        # The CSV handle is closed by ``with``; previously it was opened and
        # never closed.
        with open(csv_filename, "r") as csv_file:
            cur.copy_expert("copy " + table + " from STDIN csv header;", csv_file)
        cur.close()
        conn.commit()
    finally:
        conn.close()


def main():
    """Time a bulk load of each file in ``csv_files`` into an emptied table."""
    for csv_filename in csv_files:
        clear_table()  # not part of the timed region
        # The timed region covers connect, copy, commit and close.
        t1 = time.perf_counter()
        bulk_load(csv_filename)
        t2 = time.perf_counter()
        print(csv_filename, " ", t2 - t1)


if __name__ == "__main__":
    main()
36 changes: 36 additions & 0 deletions Lab4-SQL/1_d.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import psycopg2, config, csv, time

table = "COVID"
csv_files = ["data1.csv", "data2.csv", "data3.csv", "data4.csv", "data5.csv"]

# CSV cell values sent to the database as NULL; the same rule the SQL-file
# generator in 1_b.py applies.
NULL_MARKERS = ("NULL", "")


def insert_sql(table_name, column_count):
    """Return a parameterized ``insert`` with ``column_count`` placeholders."""
    placeholders = ", ".join(["%s"] * column_count)
    return "insert into " + table_name + " values (" + placeholders + ");"


def row_params(row):
    """Return the cells of ``row`` as query parameters, with NULL markers as None."""
    return [None if cell in NULL_MARKERS else cell for cell in row]


def connect():
    """Open a new database connection from the values in ``config``."""
    return psycopg2.connect(
        database=config.name,
        user=config.user,
        password=config.pswd,
        host=config.host,
        port=config.port,
    )


def clear_table():
    """Delete every row of ``table`` and commit, on its own connection."""
    conn = connect()
    try:
        cur = conn.cursor()
        cur.execute("delete from " + table + ";")
        cur.close()
        conn.commit()
    finally:
        conn.close()


def insert_rows(csv_filename):
    """Insert every data row of ``csv_filename`` over one connection.

    All rows are sent as individual ``insert`` statements and committed once
    at the end. Values are passed as query parameters rather than spliced
    into the SQL text, so cells containing quotes cannot break the statement.
    """
    conn = connect()
    try:
        cur = conn.cursor()
        with open(csv_filename, "r", newline="") as csvfile:
            csvreader = csv.reader(csvfile)
            next(csvreader, None)  # skip the header; tolerate an empty file
            for row in csvreader:
                if not row:
                    continue  # blank line: nothing to insert
                cur.execute(insert_sql(table, len(row)), row_params(row))
        cur.close()
        conn.commit()
    finally:
        # Close the connection even if an insert fails.
        conn.close()


def main():
    """Time row-by-row loading of each file in ``csv_files``."""
    for csv_filename in csv_files:
        clear_table()  # not part of the timed region
        t1 = time.perf_counter()
        insert_rows(csv_filename)
        t2 = time.perf_counter()
        print(csv_filename, " ", t2 - t1)


if __name__ == "__main__":
    main()
36 changes: 36 additions & 0 deletions Lab4-SQL/1_e.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import psycopg2, config, csv, time

table = "COVID"
csv_files = ["data6.csv", "data7.csv", "data8.csv", "data9.csv", "data10.csv"]

# CSV cell values sent to the database as NULL; the same rule the SQL-file
# generator in 1_b.py applies.
NULL_MARKERS = ("NULL", "")


def insert_sql(table_name, column_count):
    """Return a parameterized ``insert`` with ``column_count`` placeholders."""
    placeholders = ", ".join(["%s"] * column_count)
    return "insert into " + table_name + " values (" + placeholders + ");"


def row_params(row):
    """Return the cells of ``row`` as query parameters, with NULL markers as None."""
    return [None if cell in NULL_MARKERS else cell for cell in row]


def connect():
    """Open a new database connection from the values in ``config``."""
    return psycopg2.connect(
        database=config.name,
        user=config.user,
        password=config.pswd,
        host=config.host,
        port=config.port,
    )


def clear_table():
    """Delete every row of ``table`` and commit, on its own connection."""
    conn = connect()
    try:
        cur = conn.cursor()
        cur.execute("delete from " + table + ";")
        cur.close()
        conn.commit()
    finally:
        conn.close()


def insert_row(row):
    """Insert one row on a brand-new connection, commit, and disconnect."""
    conn = connect()
    try:
        cur = conn.cursor()
        # Values travel as parameters, so quotes inside a cell cannot break
        # the statement.
        cur.execute(insert_sql(table, len(row)), row_params(row))
        cur.close()
        conn.commit()
    finally:
        conn.close()


def insert_rows_reconnecting(csv_filename):
    """Insert every data row of ``csv_filename``, reconnecting for each row.

    A fresh connection is opened, committed and closed per row, so the time
    spent here includes one connection setup and teardown for every record.
    """
    with open(csv_filename, "r", newline="") as csvfile:
        csvreader = csv.reader(csvfile)
        next(csvreader, None)  # skip the header; tolerate an empty file
        for row in csvreader:
            if not row:
                continue  # blank line: nothing to insert
            insert_row(row)


def main():
    """Time connection-per-row loading of each file in ``csv_files``."""
    for csv_filename in csv_files:
        clear_table()  # not part of the timed region
        t1 = time.perf_counter()
        insert_rows_reconnecting(csv_filename)
        t2 = time.perf_counter()
        print(csv_filename, " ", t2 - t1)


if __name__ == "__main__":
    main()
32 changes: 32 additions & 0 deletions Lab4-SQL/2_a.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import psycopg2, config, time
from matplotlib import pyplot as plt

table = "COVID"
num_rows = 100    # page size used for every limit/offset query
total_rows = 100000  # number of rows paged through in total
bin_size = 40     # consecutive samples averaged into one plotted point


def bin_means(values, size):
    """Return the mean of each consecutive group of ``size`` values.

    The final group may hold fewer than ``size`` values; it is averaged over
    its actual length, so a short tail is not under-reported.
    """
    means = []
    for start in range(0, len(values), size):
        chunk = values[start:start + size]
        means.append(sum(chunk) / len(chunk))
    return means


def main():
    """Time successive limit/offset pages of ``table`` and plot the result."""
    x = []
    y = []

    conn = psycopg2.connect(
        database=config.name,
        user=config.user,
        password=config.pswd,
        host=config.host,
        port=config.port,
    )
    try:
        cur = conn.cursor()
        for i in range(total_rows // num_rows):
            sql = ("select * from " + table + " limit " + str(num_rows)
                   + " offset " + str(i * num_rows) + ";")
            t1 = time.perf_counter()
            cur.execute(sql)
            cur.fetchall()
            t2 = time.perf_counter()
            t = t2 - t1
            # Samples with no measurable duration are dropped, so the number
            # of samples is not guaranteed to be a multiple of bin_size.
            if t > 0:
                x.append(i + 1)
                y.append(t)
        cur.close()
    finally:
        conn.close()

    # Smooth the curve by averaging fixed-size groups of samples.
    x = bin_means(x, bin_size)
    y = bin_means(y, bin_size)

    plt.plot(x, y)
    plt.xlabel("iteration")
    plt.ylabel("time (s)")
    plt.show()


if __name__ == "__main__":
    main()
33 changes: 33 additions & 0 deletions Lab4-SQL/2_b.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import psycopg2, config, time
from matplotlib import pyplot as plt

table = "COVID"
num_rows = 100    # rows requested per fetchmany() call
total_rows = 100000  # number of rows fetched in total
bin_size = 40     # consecutive samples averaged into one plotted point


def bin_means(values, size):
    """Return the mean of each consecutive group of ``size`` values.

    The final group may hold fewer than ``size`` values; it is averaged over
    its actual length, so a short tail is not under-reported.
    """
    means = []
    for start in range(0, len(values), size):
        chunk = values[start:start + size]
        means.append(sum(chunk) / len(chunk))
    return means


def main():
    """Run one query over ``table``, time each fetchmany() batch, and plot."""
    x = []
    y = []

    conn = psycopg2.connect(
        database=config.name,
        user=config.user,
        password=config.pswd,
        host=config.host,
        port=config.port,
    )
    try:
        cur = conn.cursor()
        # NOTE(review): this is an unnamed cursor; confirm against the
        # psycopg2 docs whether execute() already transfers the whole result
        # to the client, in which case only local slicing is timed below.
        cur.execute("select * from " + table + ";")

        for i in range(total_rows // num_rows):
            t1 = time.perf_counter()
            cur.fetchmany(num_rows)
            t2 = time.perf_counter()
            # Every sample is kept (the old "t > 0 or True" test was always
            # true and has been removed).
            x.append(i + 1)
            y.append(t2 - t1)
        cur.close()
    finally:
        conn.close()

    # Smooth the curve by averaging fixed-size groups of samples.
    x = bin_means(x, bin_size)
    y = bin_means(y, bin_size)

    plt.plot(x, y)
    plt.xlabel("iteration")
    plt.ylabel("time (s)")
    plt.show()


if __name__ == "__main__":
    main()
Binary file added Lab4-SQL/Problem Statement.pdf
Binary file not shown.
5 changes: 5 additions & 0 deletions Lab4-SQL/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Benchmarking different methods in PostgreSQL

* Uses the psycopg2 library to connect to the database from Python
* Compares bulk loading (COPY) with individual INSERT statements with respect to time taken
* Uses cursors to reduce the fetch time of bulk queries
Binary file added Lab4-SQL/Report.pdf
Binary file not shown.
5 changes: 5 additions & 0 deletions Lab4-SQL/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
import os

# Database connection settings read by the lab scripts as config.name,
# config.user, config.pswd, config.host and config.port.
# Each value can be overridden through the environment variable named below,
# so real credentials need not be committed; when a variable is unset the
# original default is used.
name = os.environ.get("LAB4_DB_NAME", "lab4db")
user = os.environ.get("LAB4_DB_USER", "postgres")
pswd = os.environ.get("LAB4_DB_PASSWORD", "1234")
host = os.environ.get("LAB4_DB_HOST", "127.0.0.1")
port = os.environ.get("LAB4_DB_PORT", "5432")  # kept as a string, as before
Binary file added Lab6-Neo4j/Problem Statement.pdf
Binary file not shown.
4 changes: 4 additions & 0 deletions Lab6-Neo4j/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Graph Databases using Neo4j

* Loading the Twitter dataset (in CSV format) using Python
* Querying the database with the Cypher Query Language
Loading

0 comments on commit 1cf462c

Please sign in to comment.