-
Notifications
You must be signed in to change notification settings - Fork 0
/
wordc_HDFS.txt
125 lines (103 loc) · 3.33 KB
/
wordc_HDFS.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
> gedit mapper.py
> gedit reducer.py
Mapper.py
#!/usr/bin/python
"""Word-count mapper for Hadoop Streaming.

Reads text from standard input and writes one "word:1" record per word to
standard output. The "word:1" layout is what the companion reducer parses.
"""
import sys
import string


def map_words(lines):
    """Yield a "word:1" record for every whitespace-separated word in *lines*.

    lines -- any iterable of text lines (for example sys.stdin).
    """
    for line in lines:
        # split() with no argument breaks on any run of whitespace, so tabs,
        # repeated spaces and a trailing "\r\n" never end up inside a word and
        # no empty tokens are produced.
        for word in line.split():
            yield word + ":1"


if __name__ == "__main__":
    for record in map_words(sys.stdin):
        print(record)
----------------------------------------------
Reducer.py
#!/usr/bin/python
"""Word-count reducer for Hadoop Streaming.

Reads "word:count" records from standard input and writes one "word:total"
line per distinct word to standard output. Totals are emitted whenever the
word changes, so identical words must arrive on consecutive lines.
"""
import sys
import string


def count_words(lines):
    """Yield a "word:total" string for each run of identical words in *lines*.

    lines -- any iterable of "word:count" text lines (for example sys.stdin).

    Lines without a colon, with an empty word, or with a non-integer count
    are skipped instead of aborting the whole run.
    """
    current = None  # word whose total is being accumulated; None before any input
    total = 0
    for line in lines:
        # Split on the LAST colon so a word that itself contains ":" is kept whole.
        key, sep, val = line.strip().rpartition(":")
        key = key.strip()
        if not sep or not key:
            continue
        try:
            amount = int(val)
        except ValueError:
            continue
        if current is not None and key != current:
            yield current + ":" + str(total)
            total = 0
        current = key
        total += amount
    # Flush the final word: no key change follows it to trigger the emit above.
    if current is not None:
        yield current + ":" + str(total)


if __name__ == "__main__":
    for result in count_words(sys.stdin):
        print(result)
----------------------------------------------
> more mapper.py
> more reducer.py
> chmod +x mapper.py
> chmod +x reducer.py
> pwd
> echo "A long time ago in a galaxy far far away" > /home/cloudera/testfile1
>hdfs dfs -mkdir /user/cloudera/input
>hdfs dfs -put /home/cloudera/testfile1 /user/cloudera/input
> echo "Another episode of Star Wars" > /home/cloudera/testfile2
>hdfs dfs -put /home/cloudera/testfile2 /user/cloudera/input
>hdfs dfs -ls /user/cloudera/input
> hadoop jar /usr/lib/hadoop-mapreduce/hadoop-streaming.jar \
-input /user/cloudera/input \
-output /user/cloudera/output \
-mapper /home/cloudera/mapper.py \
-reducer /home/cloudera/reducer.py
>To leave safe mode
>hdfs dfsadmin -safemode leave
> To see the result from Hadoop output folder.
> hdfs dfs -cat /user/cloudera/output/part-00000
> hdfs dfs -rm -R output
> hdfs dfs -ls /user/cloudera/input
> chmod +x mapper.py
> chmod +x reducer.py
> hadoop jar /usr/lib/hadoop-mapreduce/hadoop-streaming.jar \
-input /user/cloudera/input \
-output /user/cloudera/output \
-mapper /home/cloudera/mapper.py \
-reducer /home/cloudera/reducer.py
> hdfs dfs -cat /user/cloudera/output/part-00000
-------------------------------------------------------------------------------------------------------------------------------
Program 2:
input.txt
2012-01-01 09:00 San Jose Men's Clothing 214.05 Amex
2012-01-01 09:00 Fort Worth Women's Clothing 153.57 Visa
2012-01-01 09:00 San Diego Music 66.08 Cash
------------------------------------------------------------------- # Mapper.py
#!/usr/bin/python
# Sales mapper: turns each purchase record into a "store<TAB>cost" pair.
#
# Every input line is expected to hold six tab-separated columns:
#   date, time, store name, item description, cost, method of payment
# Only the store name (column 2) and the cost (column 4), counting from
# zero, are written to standard output, separated by a tab.
import sys

for record in sys.stdin:
    fields = record.strip().split("\t")
    # Records that do not have exactly six columns are ignored.
    if len(fields) != 6:
        continue
    store, cost = fields[2], fields[4]
    print("{0}\t{1}".format(store, cost))
------------------------------------------------------------------------ #Reducer.py
#!/usr/bin/python
"""Sales-per-store reducer for Hadoop Streaming.

Reads "store<TAB>sale" records from standard input and writes one
"store<TAB>total" line per store to standard output. A total is emitted
whenever the store name changes, so all records for one store must arrive
on consecutive lines.
"""
import sys


def total_sales(lines):
    """Yield a "store<TAB>total" string for each run of identical stores.

    lines -- any iterable of "store<TAB>sale" text lines (for example
             sys.stdin).

    Lines that do not split into exactly two tab-separated fields, or whose
    sale amount is not a number, are skipped.
    """
    current_store = None  # store being accumulated; None before any input
    running_total = 0
    for line in lines:
        fields = line.strip().split("\t")
        if len(fields) != 2:
            # Something has gone wrong. Skip this line.
            continue
        store, sale = fields
        try:
            amount = float(sale)
        except ValueError:
            continue
        if current_store is not None and store != current_store:
            # Build the record with format() so the output is exactly
            # "store<TAB>total" under both Python 2 and Python 3; a
            # parenthesised multi-argument print would emit a tuple repr
            # on Python 2 and pad the tab with spaces on Python 3.
            yield "{0}\t{1}".format(current_store, running_total)
            running_total = 0
        current_store = store
        running_total += amount
    # Flush the last store: no key change follows it.
    if current_store is not None:
        yield "{0}\t{1}".format(current_store, running_total)


if __name__ == "__main__":
    for result in total_sales(sys.stdin):
        print(result)