wordc_HDFS.txt

> gedit mapper.py
> gedit reducer.py

Mapper.py

#!/usr/bin/python
import sys
import string
def map_words(lines):
    """Yield one ``word:1`` record for every word found in *lines*.

    lines: any iterable of text lines (for example ``sys.stdin``).

    Every word on every line produces a record; blank lines produce none.
    """
    for line in lines:
        # split() with no argument splits on any run of whitespace and never
        # returns empty strings, so repeated spaces, tabs and the trailing
        # newline need no separate clean-up.
        for word in line.split():
            yield word + ":1"


def main():
    """Read text from stdin and print one ``word:1`` record per word."""
    for record in map_words(sys.stdin):
        print(record)


if __name__ == "__main__":
    main()


----------------------------------------------
Reducer.py

#!/usr/bin/python
import sys
import string
def reduce_counts(lines):
    """Yield ``word:total`` for each run of identical keys in *lines*.

    lines: iterable of ``word:count`` records in which all records for the
    same word are adjacent (i.e. the input is sorted by key).

    Records without a ``:`` separator or with a non-integer count are
    skipped. The total for the last key is emitted once the input ends.
    """
    current_key = None
    total = 0
    for line in lines:
        # Split on the LAST colon so that words which themselves contain a
        # colon (e.g. "http://example") keep their full text as the key.
        # strip() removes the newline and any other surrounding whitespace.
        key, sep, val = line.strip().rpartition(":")
        if not sep:
            # No separator at all: malformed record, ignore it.
            continue
        try:
            amount = int(val.strip())
        except ValueError:
            # Count is not a number: malformed record, ignore it.
            continue
        key = key.strip()
        if current_key is not None and key != current_key:
            yield current_key + ":" + str(total)
            total = 0
        current_key = key
        # Add the emitted value rather than assuming it is always 1, so
        # partially aggregated input is summed correctly too.
        total += amount
    if current_key is not None:
        # Flush the final key; nothing follows it to trigger the emit above.
        yield current_key + ":" + str(total)


def main():
    """Read sorted ``word:count`` records from stdin and print the totals."""
    for record in reduce_counts(sys.stdin):
        print(record)


if __name__ == "__main__":
    main()

----------------------------------------------

> more mapper.py
> more reducer.py

> chmod +x mapper.py
> chmod +x reducer.py

> pwd

> echo "A long time ago in a galaxy far far away" > /home/cloudera/testfile1

>hdfs dfs -mkdir /user/cloudera/input

>hdfs dfs -put /home/cloudera/testfile1 /user/cloudera/input
> echo "Another episode of Star Wars" > /home/cloudera/testfile2
>hdfs dfs -put /home/cloudera/testfile2 /user/cloudera/input

>hdfs dfs -ls /user/cloudera/input

> hadoop jar /usr/lib/hadoop-mapreduce/hadoop-streaming.jar \
 -input /user/cloudera/input \
 -output /user/cloudera/output \
 -mapper /home/cloudera/mapper.py \
 -reducer /home/cloudera/reducer.py

> To leave safe mode (if HDFS reports that the NameNode is in safe mode):
>hdfs dfsadmin -safemode leave

> To see the result from Hadoop output folder.
> hdfs dfs -cat /user/cloudera/output/part-00000

> hdfs dfs -rm -R output

> hdfs dfs -ls /user/cloudera/input
> chmod +x mapper.py
> chmod +x reducer.py
> hadoop jar /usr/lib/hadoop-mapreduce/hadoop-streaming.jar \
 -input /user/cloudera/input \
 -output /user/cloudera/output \
 -mapper /home/cloudera/mapper.py \
 -reducer /home/cloudera/reducer.py

> hdfs dfs -cat /user/cloudera/output/part-00000

-------------------------------------------------------------------------------------------------------------------------------

Program 2:

input.txt
2012-01-01 09:00 San Jose Men's Clothing 214.05 Amex
2012-01-01 09:00 Fort Worth Women's Clothing 153.57 Visa
2012-01-01 09:00 San Diego Music 66.08 Cash
------------------------------------------------------------------- # Mapper.py
#!/usr/bin/python
# Format of each line is:
# date\ttime\tstore name\titem description\tcost\tmethod of payment
#
# We want elements 2 (store name) and 4 (cost)
# We need to write them out to standard output, separated by a tab
import sys
def map_sales(lines):
    """Yield ``store<TAB>cost`` for every well-formed sales record in *lines*.

    lines: iterable of tab-separated records with exactly six fields:
    date, time, store name, item description, cost, method of payment.

    Records that do not have exactly six fields are skipped.
    """
    for line in lines:
        data = line.strip().split("\t")
        if len(data) == 6:
            # Only elements 2 (store name) and 4 (cost) are needed.
            yield "{0}\t{1}".format(data[2], data[4])


def main():
    """Read sales records from stdin and print ``store<TAB>cost`` pairs."""
    for record in map_sales(sys.stdin):
        print(record)


if __name__ == "__main__":
    main()
------------------------------------------------------------------------ #Reducer.py
#!/usr/bin/python
import sys
def reduce_sales(lines):
    """Yield ``store<TAB>total`` for each run of identical store keys.

    lines: iterable of ``key<TAB>value`` records, where key is the store
    name and value is a sale amount. All records for a particular store
    must be adjacent; when the key changes, the previous store's total is
    emitted and a new total is started.

    Records that do not have exactly two fields, or whose amount is not a
    number, are skipped.
    """
    sales_total = 0
    old_key = None
    for line in lines:
        data_mapped = line.strip().split("\t")
        if len(data_mapped) != 2:
            # Something has gone wrong. Skip this line.
            continue
        this_key, this_sale = data_mapped
        try:
            # Parse before touching any state so a bad amount cannot
            # trigger a key change or corrupt the running total.
            amount = float(this_sale)
        except ValueError:
            continue
        if old_key is not None and old_key != this_key:
            # Emit exactly key<TAB>total, the same shape the mapper uses.
            yield "{0}\t{1}".format(old_key, sales_total)
            sales_total = 0
        old_key = this_key
        sales_total += amount
    if old_key is not None:
        # Flush the final store; no later key change will do it.
        yield "{0}\t{1}".format(old_key, sales_total)


def main():
    """Read sorted ``store<TAB>cost`` records from stdin and print totals."""
    for record in reduce_sales(sys.stdin):
        print(record)


if __name__ == "__main__":
    main()