-
Notifications
You must be signed in to change notification settings - Fork 0
/
sales_HDFS.txt
45 lines (34 loc) · 1.06 KB
/
sales_HDFS.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
---------------------------------------------------------------
mapper.py
#!/usr/bin/python
"""Hadoop Streaming mapper: emit "store<TAB>cost" for each sales record."""
import sys


def map_records(lines):
    """Yield one "store<TAB>cost" string per well-formed input line.

    A line is well-formed when, after surrounding whitespace is stripped,
    it splits into exactly six tab-separated fields, read in this order:
    date, time, store, item, cost, payment.  Every other line is skipped
    without any output.

    The cost is passed through as text; it is not validated here.
    """
    for line in lines:
        fields = line.strip().split("\t")
        if len(fields) != 6:
            continue
        # Only the store (3rd field) and cost (5th field) are emitted.
        store, cost = fields[2], fields[4]
        yield "{0}\t{1}".format(store, cost)


def main():
    """Map every line read from standard input and print the results."""
    for record in map_records(sys.stdin):
        print(record)


if __name__ == "__main__":
    main()
----------------------------------------------------------------
reducer.py
#!/usr/bin/python
"""Hadoop Streaming reducer: total the sale values for each key."""
import sys


def reduce_records(lines):
    """Yield one "key<TAB>total" string per run of consecutive equal keys.

    Each input line is expected to look like "key<TAB>value".  Lines that
    do not split into exactly two tab-separated fields (after surrounding
    whitespace is stripped) are skipped.

    A total is emitted every time the key changes, plus once at the end of
    input, so the input must arrive grouped by key to get a single total
    per key.  Nothing is yielded for empty input.

    Raises:
        ValueError: if a value field cannot be converted with float().
    """
    current_key = None
    total = 0
    for line in lines:
        fields = line.strip().split("\t")
        if len(fields) != 2:
            continue
        key, sale = fields
        # A different key means the previous run is finished: flush it.
        if current_key is not None and key != current_key:
            yield "{0}\t{1}".format(current_key, total)
            total = 0
        current_key = key
        total += float(sale)
    # Flush the final run, which no later key change will trigger.
    if current_key is not None:
        yield "{0}\t{1}".format(current_key, total)


def main():
    """Reduce the lines read from standard input and print each total."""
    for record in reduce_records(sys.stdin):
        # Single pre-formatted argument: exactly one tab between key and
        # total, with no padding spaces, under both Python 2 and Python 3.
        print(record)


if __name__ == "__main__":
    main()
> hadoop jar /usr/lib/hadoop-mapreduce/hadoop-streaming.jar \
-input /user/cloudera/input \
-output /user/cloudera/output \
-mapper /home/cloudera/mapper.py \
-reducer /home/cloudera/reducer.py
> hdfs dfsadmin -safemode leave