calculate_language_changes.py
# -*- coding: utf-8 -*-
"""
Created on Mon Aug 7 15:14:17 2023
@author: TuoVaisanen-e01
"""
import pandas as pd
import geopandas as gpd
import gc
import seaborn as sns
import argparse
# set up argument parser
ap = argparse.ArgumentParser()
# Path to the 250 m grid geopackage
ap.add_argument("-g", "--grid", required=True,
                help="Path to geopackage file containing 250 m grid covering the HMA.")
# Path to the folder with yearly first-language CSVs
ap.add_argument("-lg", "--langgrid", required=True,
                help="Path to folder containing CSVs with first language information and individual unique ids")
# Path to the folder with yearly home location CSVs
ap.add_argument("-hg", "--homegrid", required=True,
                help="Path to folder containing CSVs with home locations and individual unique ids")
# Path to the output folder
ap.add_argument("-o", "--output", required=True,
                help="Path to output folder. For example: /path/to/folder/. The files will be named: 'HMA_individuals_lang_[YEAR]_250m.pkl' and 'HMA_individuals_lang_changes_1987-2019.pkl'")
# parse arguments
args = vars(ap.parse_args())
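# Example invocation (the paths below are hypothetical and only illustrate the
# expected layout; the folder arguments are joined to file names by simple
# string concatenation, so they should end with a trailing slash):
#
#   python calculate_language_changes.py \
#       -g /data/hma/HMA_grid_250m.gpkg \
#       -lg /data/hma/mothertongues/ \
#       -hg /data/hma/home_locations/ \
#       -o /data/hma/output/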
# set seaborn style
sns.set()
# read hma grid data in
hma = gpd.read_file(args['grid'])
# convert grid id to integer
hma['NRO'] = hma['NRO'].astype(int)
# get hma grid id list
hma_gids = list(hma['NRO'].values)
# list holding the yearly language columns (one Series per year)
langlist = []
# loop over the years
for i in range(1987, 2020):
    # print message
    print('[INFO] - Processing year ' + str(i))
    # read first language and home location data for the year
    langs = pd.read_csv(args['langgrid'] + '{}_mothertongues.csv'.format(i),
                        sep=',', encoding='utf-8')
    homes = pd.read_csv(args['homegrid'] + 'henkilo_paikkatiedot_{}.csv'.format(i),
                        sep=',', encoding='utf-8')
    # drop individuals without a home grid id
    homes = homes.dropna(subset=['euref_250'])
    # convert grid id to integer
    homes['euref_250'] = homes['euref_250'].astype(int)
    # keep only individuals who live in the HMA
    homes = homes[homes['euref_250'].isin(hma_gids)]
    # get languages of individuals who live in the HMA
    langs = pd.merge(homes[['shnro', 'euref_250']], langs[['shnro', 'kieli']],
                     how='left', on='shnro')
    # add year
    langs['year'] = i
    # save the yearly data to disk
    langs.to_pickle(args['output'] + 'HMA_individuals_lang_{}_250m.pkl'.format(str(i)))
    # build the new column name
    colname = 'lang' + str(i)
    # rename the language column
    langs = langs.rename(columns={'kieli': colname})
    # index by individual id
    langs = langs.set_index('shnro')
    # append the yearly language column to the list
    langlist.append(langs[colname])
# print message
print('[INFO] - Lists appended')
# join dataframes
langhistory = pd.concat(langlist, axis=1)
print('[INFO] - Lists concatenated into a dataframe')
# drop individuals with no language observations at all (all years NaN)
langhistory = langhistory.dropna(thresh=1)
# delete langlist to release memory
del langlist
del hma
del langs
del homes
del hma_gids
gc.collect()
# list to hold one record per individual who changed language
records = []
# print message
print('[INFO] - Calculating changes in registered languages....')
# detect changes
for i, row in langhistory.iterrows():
    # get the language history of the individual, skipping years without data
    individual = row.dropna().values
    # reduce the history to unique languages in order of first appearance
    # (this preserves the direction of change; note that a later return to an
    # earlier language is not recorded as a separate change)
    individual = list(dict.fromkeys(individual))
    # keep only individuals who have changed language at least once
    if len(individual) > 1:
        records.append({'shnro': i,
                        'lang_hist': individual,
                        'change_n': len(individual),
                        'change': True})
# collect the records into a dataframe
# (DataFrame.append was removed in pandas 2.0, so rows are gathered in a list
# and converted in one step, which is also much faster)
result = pd.DataFrame(records, columns=['shnro', 'lang_hist', 'change_n', 'change'])
# save resulting dataframe
result.to_pickle(args['output'] + 'HMA_individuals_lang_changes_1987-2019.pkl')
print('[INFO] - Results saved to pickle!')
# print results
print('\n[INFO] - Top 30 language change paths as a share of individuals who changed language: \n')
print((result['lang_hist'].apply(tuple).value_counts()[:30] / len(result)) * 100)
print('\n[INFO] - Top 30 final languages as a share of all language changes: \n')
print((result['lang_hist'].apply(lambda x: x[-1]).value_counts()[:30] / len(result)) * 100)
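# A minimal sketch for inspecting the saved results afterwards (assumes the
# same hypothetical output folder as in the example invocation above):
#
#   import pandas as pd
#   changes = pd.read_pickle('/data/hma/output/HMA_individuals_lang_changes_1987-2019.pkl')
#   print(changes.head())
#   print(changes['change_n'].describe())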