forked from arc12/Text-Mining-Weak-Signals
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Stem Helper.R
42 lines (34 loc) · 1.98 KB
/
Stem Helper.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
##
## simple helper to find the stemmed forms of candidate words and also to determine the shortest and most prevalent forms in a corpus
##
library("tm")

# Where to find a corpus. The saved workspace is expected to provide the
# corpus object `corp` that the stem-completion calls below use as their
# dictionary; fail early with a clear message if it does not.
load("/home/arc1/R Projects/Text Mining Weak Signals Output/Rising and Falling Terms/Union C/2011/Corpus.RData")
if (!exists("corp")) {
  stop("Corpus.RData did not provide an object named 'corp'", call. = FALSE)
}

# Space-separated list of words to stem. Keep exactly ONE assignment active:
# the commented-out lines are alternative word lists kept for re-use (they
# used to be live assignments that were immediately overwritten).
#look.for<-"LMS VLE LCMS E-Portfolio Games Gesture Metadata Adaptive Open Social Ubiquitous Semantic Agents Cloud Broadband Video"
#look.for<-"cloud virtualisation virtualization saas paas"
#look.for<-"ebook etextbooks"
#look.for<-"analytics analytical analysis data"
#look.for<-"gesturebased gestural"
#look.for<-"context contextsensitive contextaware contextenriched location locationbased locationaware geospatial"
#look.for<-"Game gaming gamification game-based game-play"
look.for <- "Immersive Standards OSS Blog Twitter Wiki Tablet Smartphone Mobile Streaming"

# Split on runs of whitespace and drop empty tokens, so a doubled, leading or
# trailing space does not yield an empty "word" that would then be stemmed and
# looked up like a real one.
look.for.vec <- unlist(strsplit(look.for, "[[:space:]]+"))
look.for.vec <- look.for.vec[nzchar(look.for.vec)]

# Lower-casing and punctuation removal are applied only to the copy that is
# stemmed; look.for.vec keeps the words as typed for the report below.
look.for.stems <- stemDocument(tolower(removePunctuation(look.for.vec)))

# Now look up the shortest and the prevalent forms leading to each stem within
# the corpus. Where no completion is found (NA or ""), fall back to the stem
# itself; this relies on stemCompletion() naming each result by its input stem.
shortest.words <- stemCompletion(look.for.stems, corp, type = "shortest")
completion.fails <- is.na(shortest.words) | shortest.words == ""
shortest.words[completion.fails] <- names(shortest.words)[completion.fails]

prevalent.words <- stemCompletion(look.for.stems, corp, type = "prevalent")
completion.fails <- is.na(prevalent.words) | prevalent.words == ""
prevalent.words[completion.fails] <- names(prevalent.words)[completion.fails]

# Render a character vector as the text of an R literal: c('a','b','c').
vector.literal <- function(words) {
  paste0("c('", paste(words, collapse = "','"), "')")
}

# Some output for easy cut/paste
print("Looked-for words:")
print(vector.literal(look.for.vec))
print("Stemmed words:")
print(vector.literal(look.for.stems))
print("Shortest words in the given corpus to match these stems:")
print(vector.literal(shortest.words))
print("Prevalent words in the given corpus to match these stems:")
print(vector.literal(prevalent.words))