forked from RicardoMoya/Example_Pandas
-
Notifications
You must be signed in to change notification settings - Fork 47
/
DataFrame_pivot_table.py
58 lines (42 loc) · 2.17 KB
/
DataFrame_pivot_table.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# -*- coding: utf-8 -*-
__author__ = 'Who ?'
import pandas as pd
import numpy as np
# Load Data
userHeader = ['user_id', 'gender', 'age', 'ocupation', 'zip']
users = pd.read_csv('dataSet/users.txt', engine='python',
sep='::', header=None, names=userHeader)
movieHeader = ['movie_id', 'title', 'genders']
movies = pd.read_csv('dataSet/movies.txt', engine='python',
sep='::', header=None, names=movieHeader)
ratingHeader = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('dataSet/ratings.txt', engine='python',
sep='::', header=None, names=ratingHeader)
# Merge data
mergeRatings = pd.merge(pd.merge(users, ratings), movies)
# Clone DataFrame
def cloneDF(df):
return pd.DataFrame(df.values.copy(), df.index.copy(), df.columns.copy()).convert_objects(convert_numeric=True)
# Simple pivot (Groupby + avg)
df_1 = cloneDF(mergeRatings)
df_1 = df_1.pivot_table(index=['movie_id', 'title'])
print('Columns(movie_id + title) to Index: \n%s' % df_1[:5])
print('\n==================================================================\n')
# Pivot by specific columns (Groupby + avg by rating and age)
df_2 = cloneDF(mergeRatings)
df_2 = df_2.pivot_table(index=['movie_id', 'title'], values=['rating', 'age'])
print('Columns(movie_id + title) to Index and avg by values \n%s' % df_2[:5])
print('\n==================================================================\n')
# Pivot by specific columns, applying a functions
df_3 = cloneDF(mergeRatings)
df_3 = df_3.pivot_table(index=['movie_id', 'title'], values=[
'rating'], aggfunc=[np.sum, np.size, np.mean])
print('Columns(movie_id + title) to Index and specific functions by values \n%s' %
df_3[:5])
print('\n==================================================================\n')
# Pivot by specific index, specific values and specific columns, applying a functions
df_4 = cloneDF(mergeRatings)
df_4 = df_4.pivot_table(index=['movie_id', 'title'], values=['rating'], columns=['gender'], aggfunc=[np.mean],
fill_value=-1, margins=True)
print('Columns(movie_id + title) to Index and avg rating applied by gender \n%s' %
df_4[:5])