Skip to content

Commit

Permalink
국가별 Q&A 게시글 MapReduce 구현
Browse files Browse the repository at this point in the history
(단어, 형태소)가 key값
  • Loading branch information
stophwan committed Dec 8, 2022
1 parent 7c8e3e0 commit e1bd3d7
Show file tree
Hide file tree
Showing 2 changed files with 3,104 additions and 0 deletions.
37 changes: 37 additions & 0 deletions mapreduce/Country_MapReduce.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import sys
import re
from konlpy.tag import Okt
from hanspell import spell_checker
from mrjob.job import MRJob
from mrjob.step import MRStep
from mrjob.protocol import RawProtocol

class CountryCount(MRJob):
    """Count (word, part-of-speech) pairs found in Korean text lines.

    The mapper keeps only the Hangul runs of each input line, passes them
    through the hanspell spell checker, tags the result with KoNLPy's Okt
    and emits one ``"word,tag"`` key per token. The reducer sums the counts
    and writes raw text lines of the form ``index,word,tag,count``.
    """

    OUTPUT_PROTOCOL = RawProtocol

    # Running row number prepended to every reducer output line.
    # NOTE(review): this is class-level, per-process state, so the numbers
    # are only unique when every key goes through a single reducer
    # process -- confirm that matches how the job is deployed.
    idx = 0

    # One or more consecutive Hangul syllables; compiled once instead of on
    # every mapper call.
    _HANGUL_RUN = re.compile('[가-힣]+')

    # Okt tagger, created lazily on first use and then reused (see _tagger).
    _okt = None

    def steps(self):
        """Return the single map/reduce step that makes up this job."""
        return [
            MRStep(mapper=self.map_country_count,
                   reducer=self.reduce_country_count)
        ]

    def _tagger(self):
        """Return the shared Okt tagger, building it on first use.

        Building a new tagger for every input line is wasteful, so one
        instance is cached on the job object and reused by the mapper.
        """
        if self._okt is None:
            self._okt = Okt()
        return self._okt

    def map_country_count(self, _, line):
        """Emit ``("word,tag", 1)`` for every token of one input line.

        Args:
            _: Input key supplied by mrjob; unused.
            line: One line of input text.

        Yields:
            ``(key, 1)`` pairs where ``key`` is the token text and its Okt
            part-of-speech tag joined by a comma.
        """
        words = self._HANGUL_RUN.findall(line.strip())
        if not words:
            # Nothing to tag: skip the spell-check and tagging calls for
            # lines that contain no Hangul at all.
            return

        # Same text as the original loop built, including the trailing
        # space, so the spell checker receives an unchanged request.
        content = ' '.join(words) + ' '

        han_spell = spell_checker.check(content)
        if han_spell.result:
            # Use the corrected text only when the checker reports success;
            # otherwise fall back to the uncorrected Hangul text.
            content = han_spell.checked

        for word, tag in self._tagger().pos(content):
            yield word + ',' + tag, 1

    def reduce_country_count(self, key, values):
        """Sum the counts of one key and emit a numbered output line.

        Args:
            key: ``"word,tag"`` string produced by the mapper.
            values: Iterable of integer counts for that key.

        Yields:
            ``(None, "index,word,tag,count")``; with ``RawProtocol`` and a
            ``None`` key only the value string is written.
        """
        CountryCount.idx += 1
        output = ','.join((str(CountryCount.idx), key, str(sum(values))))
        yield None, output

# Script entry point: let mrjob parse the command line and run the job.
if __name__ == "__main__":
    CountryCount.run()
Loading

0 comments on commit e1bd3d7

Please sign in to comment.