From cf0d268d7da59fa6a1d8dc440eaaff057c505019 Mon Sep 17 00:00:00 2001 From: "yiruan@ebay.com" Date: Fri, 13 Sep 2019 14:55:12 +0800 Subject: [PATCH] enhancement for stackoverflow survey --- .../stackoverflow/README.md" | 171 ++++++++++++++++++ .../logstash-stackoverflow-survey.conf" | 32 ++++ .../stackoverflow/logstash-survay.conf" | 27 --- .../stackoverflow/reindex.json" | 4 +- .../stackoverflow/survey-mapping.json" | 15 ++ 5 files changed, 220 insertions(+), 29 deletions(-) create mode 100644 "part-5/\345\256\236\346\210\2302-stackoverflow\347\224\250\346\210\267\350\260\203\346\237\245\351\227\256\345\215\267\345\210\206\346\236\220/stackoverflow/logstash-stackoverflow-survey.conf" delete mode 100644 "part-5/\345\256\236\346\210\2302-stackoverflow\347\224\250\346\210\267\350\260\203\346\237\245\351\227\256\345\215\267\345\210\206\346\236\220/stackoverflow/logstash-survay.conf" create mode 100644 "part-5/\345\256\236\346\210\2302-stackoverflow\347\224\250\346\210\267\350\260\203\346\237\245\351\227\256\345\215\267\345\210\206\346\236\220/stackoverflow/survey-mapping.json" diff --git "a/part-5/\345\256\236\346\210\2302-stackoverflow\347\224\250\346\210\267\350\260\203\346\237\245\351\227\256\345\215\267\345\210\206\346\236\220/stackoverflow/README.md" "b/part-5/\345\256\236\346\210\2302-stackoverflow\347\224\250\346\210\267\350\260\203\346\237\245\351\227\256\345\215\267\345\210\206\346\236\220/stackoverflow/README.md" index d4a0445..d99d26a 100644 --- "a/part-5/\345\256\236\346\210\2302-stackoverflow\347\224\250\346\210\267\350\260\203\346\237\245\351\227\256\345\215\267\345\210\206\346\236\220/stackoverflow/README.md" +++ "b/part-5/\345\256\236\346\210\2302-stackoverflow\347\224\250\346\210\267\350\260\203\346\237\245\351\227\256\345\215\267\345\210\206\346\236\220/stackoverflow/README.md" @@ -1,2 +1,173 @@ +# stackoverflow 用户调查问卷分析 +## 课程demo +``` +sudo bin/logstash -f ./logstash-stackoverflow-survey.conf + +PUT final-stackoverflow-survey +{ + "mappings": { + 
"dynamic_templates": [ + { + "strings_as_keywords": { + "match_mapping_type": "string", + "mapping": { + "type": "keyword" + } + } + } + ] + } +} + + + +数字相关 +YearsCode +WorkWeekHrs +Age +Age1stCode 16 +YearsCodePro + + + +PUT _ingest/pipeline/stackoverflow_pipeline +{ + "description": "Pipeline for stackoverflow survey", + "processors": [ + { + "split": { + "field": "DatabaseDesireNextYear", + "separator": ";" + } + }, + { + "split": { + "field": "DatabaseWorkedWith", + "separator": ";" + } + }, + { + "split": { + "field": "DevEnviron", + "separator": ";" + } + }, + { + "split": { + "field": "LanguageWorkedWith", + "separator": ";" + } + }, + { + "split": { + "field": "MiscTechDesireNextYear", + "separator": ";" + } + }, + { + "split": { + "field": "PlatformWorkedWith", + "separator": ";" + } + }, + { + "split": { + "field": "PlatformDesireNextYear", + "separator": ";" + } + }, + { + "split": { + "field": "WebFrameWorkedWith", + "separator": ";" + } + }, + { + "split": { + "field": "WebFrameDesireNextYear", + "separator": ";" + } + }, + { + "split": { + "field": "Containers", + "separator": ";" + } + }, + { + "script": { + "source": """ + try{ + ctx.YearsCode = Integer.parseInt(ctx.YearsCode); + }catch(Exception e){ + ctx.YearsCode = 0; + } +""" + } + }, + { + "script": { + "source": """ + try{ + ctx.WorkWeekHrs = Integer.parseInt(ctx.WorkWeekHrs); + }catch(Exception e){ + ctx.WorkWeekHrs = 0; + } +""" + } + }, + { + "script": { + "source": """ + try{ + ctx.Age = Integer.parseInt(ctx.Age); + }catch(Exception e){ + ctx.Age = 0; + } +""" + } + }, + { + "script": { + "source": """ + try{ + ctx.Age1stCode = Integer.parseInt(ctx.Age1stCode); + }catch(Exception e){ + ctx.Age1stCode = 0; + } +""" + } + }, + { + "script": { + "source": """ + try{ + ctx.YearsCodePro = Integer.parseInt(ctx.YearsCodePro); + }catch(Exception e){ + ctx.YearsCodePro = 0; + } +""" + } + } + ] +} + + + +POST _reindex?wait_for_completion=false +{ + "source": { + "index": "stackoverflow-survey-raw" 
+ }, + "dest": { + "index": "final-stackoverflow-survey", + "pipeline": "stackoverflow_pipeline" + } +} + +GET final-stackoverflow-survey/_mapping + +``` +## 参考链接 http://stackoverflow.com/research/
 diff --git "a/part-5/\345\256\236\346\210\2302-stackoverflow\347\224\250\346\210\267\350\260\203\346\237\245\351\227\256\345\215\267\345\210\206\346\236\220/stackoverflow/logstash-stackoverflow-survey.conf" "b/part-5/\345\256\236\346\210\2302-stackoverflow\347\224\250\346\210\267\350\260\203\346\237\245\351\227\256\345\215\267\345\210\206\346\236\220/stackoverflow/logstash-stackoverflow-survey.conf" new file mode 100644 index 0000000..fa010bb --- /dev/null +++ "b/part-5/\345\256\236\346\210\2302-stackoverflow\347\224\250\346\210\267\350\260\203\346\237\245\351\227\256\345\215\267\345\210\206\346\236\220/stackoverflow/logstash-stackoverflow-survey.conf" @@ -0,0 +1,32 @@ +input { + file { + path => "/Users/yiruan/geektime/logstash-7.3.2/survey_results_public.csv" + start_position => "beginning" + sincedb_path => "/dev/null" + } +} + +filter { + csv { + autogenerate_column_names => false + skip_empty_columns => true + + columns => [ + "Respondent","MainBranch","Hobbyist","OpenSourcer","OpenSource","Employment","Country","Student","EdLevel","UndergradMajor","EduOther","OrgSize","DevType","YearsCode","Age1stCode","YearsCodePro","CareerSat","JobSat","MgrIdiot","MgrMoney","MgrWant","JobSeek","LastHireDate","LastInt","FizzBuzz","JobFactors","ResumeUpdate","CurrencySymbol","CurrencyDesc","CompTotal","CompFreq","ConvertedComp","WorkWeekHrs","WorkPlan","WorkChallenge","WorkRemote","WorkLoc","ImpSyn","CodeRev","CodeRevHrs","UnitTests","PurchaseHow","PurchaseWhat","LanguageWorkedWith","LanguageDesireNextYear","DatabaseWorkedWith","DatabaseDesireNextYear","PlatformWorkedWith","PlatformDesireNextYear","WebFrameWorkedWith","WebFrameDesireNextYear","MiscTechWorkedWith","MiscTechDesireNextYear","DevEnviron","OpSys","Containers","BlockchainOrg","BlockchainIs","BetterLife","ITperson","OffOn","SocialMedia","Extraversion","ScreenName","SOVisit1st","SOVisitFreq","SOVisitTo","SOFindAnswer","SOTimeSaved","SOHowMuchTime","SOAccount","SOPartFreq","SOJobs","EntTeams","SOComm","WelcomeChange",
"SONewContent","Age","Gender","Trans","Sexuality","Ethnicity","Dependents","SurveyLength","SurveyEase" + ] + + } + if ([Respondent] == "Respondent") { # drop the CSV header row + drop {} + } + mutate { remove_field => ["message", "@version", "@timestamp", "host"] } +} +output { + stdout { codec => "dots" } + elasticsearch { + hosts => ["http://localhost:9200"] + index => "stackoverflow-survey-raw" + document_type => "_doc" + } +} + diff --git "a/part-5/\345\256\236\346\210\2302-stackoverflow\347\224\250\346\210\267\350\260\203\346\237\245\351\227\256\345\215\267\345\210\206\346\236\220/stackoverflow/logstash-survay.conf" "b/part-5/\345\256\236\346\210\2302-stackoverflow\347\224\250\346\210\267\350\260\203\346\237\245\351\227\256\345\215\267\345\210\206\346\236\220/stackoverflow/logstash-survay.conf" deleted file mode 100644 index b54f189..0000000 --- "a/part-5/\345\256\236\346\210\2302-stackoverflow\347\224\250\346\210\267\350\260\203\346\237\245\351\227\256\345\215\267\345\210\206\346\236\220/stackoverflow/logstash-survay.conf" +++ /dev/null @@ -1,27 +0,0 @@ -input { - file { - path => ["/Users/yiruan/geektime/logstash-7.1.0/survey_results_public.csv"] - start_position => "beginning" - sincedb_path => "/dev/null" - } -} -filter { - csv { - separator => "," - autodetect_column_names => true - autogenerate_column_names => true - } - - } - - -output { - elasticsearch { - document_type => "_doc" - index => "stackoverflowraw" - hosts => ["http://localhost:9200"] - } - stdout{ - codec => rubydebug - } -} diff --git "a/part-5/\345\256\236\346\210\2302-stackoverflow\347\224\250\346\210\267\350\260\203\346\237\245\351\227\256\345\215\267\345\210\206\346\236\220/stackoverflow/reindex.json" "b/part-5/\345\256\236\346\210\2302-stackoverflow\347\224\250\346\210\267\350\260\203\346\237\245\351\227\256\345\215\267\345\210\206\346\236\220/stackoverflow/reindex.json" index f318db1..0117fbe 100644 --- 
"a/part-5/\345\256\236\346\210\2302-stackoverflow\347\224\250\346\210\267\350\260\203\346\237\245\351\227\256\345\215\267\345\210\206\346\236\220/stackoverflow/reindex.json" +++ "b/part-5/\345\256\236\346\210\2302-stackoverflow\347\224\250\346\210\267\350\260\203\346\237\245\351\227\256\345\215\267\345\210\206\346\236\220/stackoverflow/reindex.json" @@ -1,10 +1,10 @@ POST _reindex?wait_for_completion=false { "source": { - "index": "stackoverflow-surveys-2019" + "index": "stackoverflow-survey-raw" }, "dest": { - "index": "stackoverflow-surveys-results", + "index": "final-stackoverflow-survey", "pipeline": "stackoverflow_pipeline" } } \ No newline at end of file diff --git "a/part-5/\345\256\236\346\210\2302-stackoverflow\347\224\250\346\210\267\350\260\203\346\237\245\351\227\256\345\215\267\345\210\206\346\236\220/stackoverflow/survey-mapping.json" "b/part-5/\345\256\236\346\210\2302-stackoverflow\347\224\250\346\210\267\350\260\203\346\237\245\351\227\256\345\215\267\345\210\206\346\236\220/stackoverflow/survey-mapping.json" new file mode 100644 index 0000000..757adb9 --- /dev/null +++ "b/part-5/\345\256\236\346\210\2302-stackoverflow\347\224\250\346\210\267\350\260\203\346\237\245\351\227\256\345\215\267\345\210\206\346\236\220/stackoverflow/survey-mapping.json" @@ -0,0 +1,15 @@ +PUT final-stackoverflow-survey +{ + "mappings": { + "dynamic_templates": [ + { + "strings_as_keywords": { + "match_mapping_type": "string", + "mapping": { + "type": "keyword" + } + } + } + ] + } +}