-
Notifications
You must be signed in to change notification settings - Fork 0
/
clean_data.py
69 lines (61 loc) · 2.72 KB
/
clean_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import pandas as pd
import json
from datetime import date, timedelta, datetime
import flatten_json
import csv
import os
import extract_data
# df = pd.DataFrame()
def open_json():
# json_file = open(extract_data.extracted_data_fn, "r")
with open(extract_data.extracted_data_fn) as json_file:
# with open(f'rentsmart_json_2022-12-29.json') as json_file:
# store file data in object
json_data = json.load(json_file)
print('json data loaded')
print (json_data)
df = pd.json_normalize(json_data['result']['records'])
print ('raw json transformed to pandas DataFrame:\n', df.head(5))
return df
def clean_data():
#rename columns
df = open_json()
df = df.rename(columns = {'_id':'id', 'date':'datetime', 'year built':'year_built'
, 'year remodeled':'year_remodeled'})
#clean zip codes
df['zip_code'] = df['zip_code'].apply(lambda x: '0' + str(x) if len(x) == 4 else str(x))
#create date column from preexisting datetime
df['date'] = df['datetime'].apply(lambda x: datetime.strptime(x, '%Y-%m-%dT%H:%M:%S'))
df['date'] = df['date'].apply(lambda x: datetime.date(x))
df['geo_coordinates'] = df['latitude'] + ', ' + df['longitude']
#reorder
df = df[['id', 'date', 'datetime', 'property_type', 'violation_type', 'description', 'owner', 'year_built'
, 'year_remodeled', 'neighborhood', 'address', 'zip_code', 'parcel', 'geo_coordinates', 'latitude', 'longitude']]
# df
print('data transformation complete:\n', df.head(5))
# return(dataframe)
# df = clean_data(df)
# df
# #################### checks in terminal ####################
# import pandas as pd
# import json
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)
# extracted_data_fn = f'rentsmart_json_2022-12-29.json'
# json_file = open(extracted_data_fn, "r")
# json_data = json.load(json_file)
# json_data['result']['fields'][0].keys()
# json_data
# df = pd.json_normalize(json_data['result']['records'])
# print(df)
# df = df.rename(columns = {'_id':'id', 'date':'datetime', 'year built':'year_built'
# , 'year remodeled':'year_remodeled'})
# df['zip_code'] = df['zip_code'].apply(lambda x: '0' + str(x) if len(x) == 4 else str(x))
# df['date'] = df['datetime'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%dT%H:%M:%S'))
# df['date'] = df['date'].apply(lambda x: datetime.datetime.date(x))
# df = df[['id', 'date', 'datetime', 'property_type', 'violation_type', 'description', 'owner', 'year_built'
# , 'year_remodeled', 'neighborhood', 'address', 'zip_code', 'parcel', 'latitude', 'longitude']]
# df
# df.to_csv(f'rentsmart_flattened_data_2022-23-22.csv', index = False)
# df.to_csv(f'rentsmart_flattened_data_{yesterday}.csv', index=False)
# # column keys