Skip to content

Commit

Permalink
text-related keyword arguments
Browse files Browse the repository at this point in the history
  • Loading branch information
guoyongzhi authored and guo-yong-zhi committed May 8, 2024
1 parent cd9d95d commit 3e3e59f
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 21 deletions.
44 changes: 25 additions & 19 deletions src/textprocessing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ function tokenizer_eng(text::AbstractString, regexp=r"\w[\w']+")
[endswith(text[i], "'s") ? text[i][1:prevind(text[i], end, 2)] : text[i] for i in indices]
end

# ISO 639-3 macrolanguages
TOKENIZERS = Dict(
[
"_default_" => tokenizer,
Expand Down Expand Up @@ -86,16 +87,19 @@ function countwords(words::AbstractVector{<:AbstractString}; language=:auto,
end

# Count the words of a single string.
# `language` selects the tokenizer and the lemmatizer; when no entry exists for it, the
# "_default_" entries of TOKENIZERS / LEMMATIZERS are used instead. All remaining keyword
# arguments are forwarded to the `countwords` method that receives the tokenizer output.
# Returns the result of applying the selected lemmatizer to the counter.
function countwords(text::AbstractString; language=:auto, kargs...)
# NOTE(review): the next two lines both resolve `language` and look like the pre- and
# post-change forms of the same step in this diff view; confirm only one is intended.
language == :auto && (language = detect_language(text))
language = detect_language(text, language)
# Warn but keep going: the lookup below falls back to the default tokenizer.
if !haskey(TOKENIZERS, language)
@warn "No built-in tokenizer for $(language)!"
end
tokenizer_ = get(TOKENIZERS, language, TOKENIZERS["_default_"])
counter = countwords(tokenizer_(text); kargs...)
lemmatizer_ = get(LEMMATIZERS, language, LEMMATIZERS["_default_"])
lemmatizer_(counter)
end

raw"""
countwords(text; regexp=r"\w[\w']+", counter=Dict{String,Int}(), kargs...)
Count words in text. And use `regexp` to split. And save results into `counter`.
countwords(text; counter=Dict{String,Int}(), kargs...)
Count the words in `text` and save the results into `counter`.
`text` can be a String, a Vector of Strings, or an opened file (IO).
"""
function countwords(textfile::IO; counter=Dict{String,Int}(), kargs...)
Expand Down Expand Up @@ -152,31 +156,30 @@ When p is 1, the power mean is the arithmetic mean. When p is 2, the power mean
"""
function rescaleweights(func=identity, p=0)
    # Return a one-argument closure that applies `_rescaleweights` to its input
    # with the captured `func` and `p`.
    return counter -> _rescaleweights(counter, func, p)
end

function _detect_language(text)
language = langid(text)
println("Language: $language")
if !haskey(STOPWORDS, language)
println("No built-in stopwords for $(language)!")
end
if !haskey(TOKENIZERS, language)
println("No built-in tokenizer for $(language)!")
function _detect_language(text, language=:auto)
if language !== :auto
return StopWords.normcode(language)
else
language = langid(text)
println("Language: $language")
return StopWords.normcode(language)
end
return language
end
function detect_language(text)
_detect_language(text)
function detect_language(text, language=:auto)
_detect_language(text, language)
end
function detect_language(text::IO)
function detect_language(text::IO, language=:auto)
p = position(text)
l = _detect_language(text)
l = _detect_language(text, language)
seek(text, p)
return l
end
"""
Process the text, filter the words, and adjust the weights. Return a vector of words and a vector of weights.
## Positional Arguments
* text: string, a vector of words, an opened file (IO), a counter, a Dict{<:String, <:Real}, a Vector{Pair}, a Vector{Tuple}, or two Vectors.
* text: a string, a vector of words, an opened file (IO), a counter, a Dict{<:String, <:Real}, a Vector{Pair}, a Vector{Tuple}, or two Vectors.
## Optional Keyword Arguments
* language: language of the text, default is :auto.
* stopwords: a set of words
* minlength, maxlength: minimum and maximum length of a word to be included
* minfrequency: minimum frequency of a word to be included
Expand All @@ -193,7 +196,10 @@ function processtext(counter::AbstractDict{<:AbstractString,<:Real};
minweight=1 / maxnum, maxweight=:auto,
process=rescaleweights(identity, 0) casemerge!)

language == :auto && (language = detect_language(keys(counter)))
language = detect_language(keys(counter), language)
if !haskey(STOPWORDS, language)
@warn "No built-in stopwords for $(language)!"
end
stopwords == :auto && (stopwords = get(STOPWORDS, language, Set()))
stopwords isa AbstractSet || (stopwords = Set(stopwords))
counter = process(counter)
Expand Down Expand Up @@ -231,7 +237,7 @@ function processtext(counter::AbstractDict{<:AbstractString,<:Real};
end

function processtext(text; language=:auto, kargs...)
language == :auto && (language = detect_language(text))
language = detect_language(text, language)
cwkw = (:counter, :regexp)
processtext(
countwords(text; language=language, filter(kw -> first(kw) cwkw, kargs)...);
Expand Down
9 changes: 8 additions & 1 deletion src/wc-class.jl
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,11 @@ The positional arguments are used to specify words and weights in various forms,
* counter::AbstractDict
* counter::AbstractVector{<:Pair}
## Optional Keyword Arguments
### text-related keyword arguments
* language: language of the text, default is :auto.
* stopwords: a set of words
* maxnum: maximum number of words, default is 500
### style-related keyword arguments
* colors = "black" # same color for all words
* colors = ("black", (0.5,0.5,0.7), "yellow", "#ff0000", 0.2) # entries are randomly chosen
Expand Down Expand Up @@ -65,7 +70,9 @@ The positional arguments are used to specify words and weights in various forms,
function wordcloud(wordsweights::Tuple; kargs...)
    # Splat the tuple's elements into positional arguments and re-dispatch,
    # passing every keyword argument through unchanged.
    return wordcloud(wordsweights...; kargs...)
end
function wordcloud(counter::AbstractDict; kargs...)
    # `keys` and `values` of the same unmodified dictionary iterate in matching order,
    # so the two collected vectors stay aligned entry by entry.
    words = collect(keys(counter))
    weights = collect(values(counter))
    return wordcloud(words, weights; kargs...)
end
function wordcloud(counter::AbstractVector{<:Union{Pair,Tuple,AbstractVector}}; kargs...)
    # Each entry carries the word in position 1 and its weight in position 2.
    words = first.(counter)
    weights = [entry[2] for entry in counter]
    return wordcloud(words, weights; kargs...)
end
# Fallback for raw text input: run `processtext` first, then build the word cloud from
# its result. Only one definition of this signature is kept; defining it twice would
# overwrite the first method, which is rejected during module precompilation.
function wordcloud(text; language=:auto, stopwords=:auto, maxnum=500, kargs...)
    # The text-related keywords go to `processtext`; every other keyword is passed on
    # to the `wordcloud` method that receives the processed result.
    wordsweights = processtext(text; language=language, stopwords=stopwords, maxnum=maxnum)
    return wordcloud(wordsweights; kargs...)
end
function wordcloud(words, weight::Number; kargs...)
    # Give every word the same weight: one copy of `weight` per element of `words`.
    weights = fill(weight, length(words))
    return wordcloud(words, weights; kargs...)
end
function wordcloud(words::AbstractVector{<:AbstractString}, weights::AbstractVector{<:Real};
colors=:auto, angles=:auto,
Expand Down
9 changes: 8 additions & 1 deletion test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -148,9 +148,16 @@ include("test_textprocessing.jl")
@test all(bg[mask] .== s[mask])
@test all(bg[.!mask] .!= s[.!mask])

# language
l1 = length(wordcloud("It's easy to generate word clouds"))
for lang in ["en", "eng", "English"]
l2 = length(wordcloud("It's easy to generate word clouds", language=lang))
@test l1 == l2
end

# overall
wordcloud(["中文", "需要", "提前", "分词"], fonts="") |> generate!
wordcloud(["the"=>1.0, "to"=>0.51, "and"=>0.50,
"of"=>0.47, "a"=>0.44, "in"=>0.33]) |> generate!
wordcloud("It's easy to generate word clouds") |> generate!
wordcloud("It's easy to generate word clouds", maxnum=10) |> generate!
end

0 comments on commit 3e3e59f

Please sign in to comment.