Skip to content

Commit

Permalink
Merge pull request #97 from JuliaText/as/towards07
Browse files Browse the repository at this point in the history
Prepare for 1.0
  • Loading branch information
aviks authored Oct 1, 2018
2 parents 58057e9 + dbf5bed commit e835044
Show file tree
Hide file tree
Showing 21 changed files with 143 additions and 136 deletions.
6 changes: 2 additions & 4 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,9 @@ language: julia
os:
- linux
julia:
- 0.6
- 0.7
- 1.0
notifications:
email: false
script:
- if [[ -a .git/shallow ]]; then git fetch --unshallow; fi
- julia -e 'Pkg.clone(pwd()); Pkg.build("TextAnalysis"); Pkg.test("TextAnalysis"; coverage=true)';
after_success:
# Julia 1.0 does not make `Pkg` available in Main by default, so it must be
# loaded explicitly before the coverage step calls into it.
- julia -e 'using Pkg; cd(Pkg.dir("TextAnalysis")); Pkg.add("Coverage"); using Coverage; Coveralls.submit(Coveralls.process_folder())';
4 changes: 2 additions & 2 deletions REQUIRE
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
julia 0.6
julia 0.7
BinaryProvider
Languages 0.2.0
Languages 0.4.0
DataFrames
WordTokenizers
Flux
Expand Down
45 changes: 24 additions & 21 deletions appveyor.yml
Original file line number Diff line number Diff line change
@@ -1,9 +1,18 @@
environment:
matrix:
- JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x86/0.6/julia-0.6-latest-win32.exe"
- JULIA_URL: "https://julialang-s3.julialang.org/bin/winnt/x64/0.6/julia-0.6-latest-win64.exe"
# - JULIA_URL: "https://julialangnightlies-s3.julialang.org/bin/winnt/x86/julia-latest-win32.exe"
# - JULIA_URL: "https://julialangnightlies-s3.julialang.org/bin/winnt/x64/julia-latest-win64.exe"
- julia_version: 0.7
- julia_version: 1
- julia_version: nightly

platform:
- x86 # 32-bit
- x64 # 64-bit

# Allow failures on nightly julia
# (tests will run but not make your overall status red)
# `allow_failures` is only recognised as a child of the top-level `matrix` key,
# so the parent key must be enabled together with it.
matrix:
  allow_failures:
  - julia_version: nightly

branches:
only:
Expand All @@ -17,24 +26,18 @@ notifications:
on_build_status_changed: false

install:
- ps: "[System.Net.ServicePointManager]::SecurityProtocol = [System.Net.SecurityProtocolType]::Tls12"
# If there's a newer build queued for the same PR, cancel this one
- ps: if ($env:APPVEYOR_PULL_REQUEST_NUMBER -and $env:APPVEYOR_BUILD_NUMBER -ne ((Invoke-RestMethod `
https://ci.appveyor.com/api/projects/$env:APPVEYOR_ACCOUNT_NAME/$env:APPVEYOR_PROJECT_SLUG/history?recordsNumber=50).builds | `
Where-Object pullRequestId -eq $env:APPVEYOR_PULL_REQUEST_NUMBER)[0].buildNumber) { `
throw "There are newer queued builds for this pull request, failing early." }
# Download most recent Julia Windows binary
- ps: (new-object net.webclient).DownloadFile(
$env:JULIA_URL,
"C:\projects\julia-binary.exe")
# Run installer silently, output to C:\projects\julia
- C:\projects\julia-binary.exe /S /D=C:\projects\julia
- ps: iex ((new-object net.webclient).DownloadString("https://raw.githubusercontent.com/JuliaCI/Appveyor.jl/version-1/bin/install.ps1"))

build_script:
# Need to convert from shallow to complete for Pkg.clone to work
- IF EXIST .git\shallow (git fetch --unshallow)
- C:\projects\julia\bin\julia -e "versioninfo();
Pkg.clone(pwd(), \"TextAnalysis\"); Pkg.build(\"TextAnalysis\")"
- echo "%JL_BUILD_SCRIPT%"
- C:\julia\bin\julia -e "%JL_BUILD_SCRIPT%"

test_script:
- C:\projects\julia\bin\julia -e "Pkg.test(\"TextAnalysis\")"
- echo "%JL_TEST_SCRIPT%"
- C:\julia\bin\julia -e "%JL_TEST_SCRIPT%"

# # Uncomment to support code coverage upload. Should only be enabled for packages
# # which would have coverage gaps without running on Windows
# on_success:
# - echo "%JL_CODECOV_SCRIPT%"
# - C:\julia\bin\julia -e "%JL_CODECOV_SCRIPT%"
10 changes: 5 additions & 5 deletions docs/push-gh-pages.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@

last_commit=readchomp(`git --no-pager log -1 --pretty=format:"%h:%s"`)

ENV["GIT_DIR"]=abspath(chomp(readstring(`git rev-parse --git-dir`)))
ENV["GIT_DIR"]=abspath(chomp(read(`git rev-parse --git-dir`, String)))

old_sha = chomp(readstring(`git rev-parse refs/remotes/origin/gh-pages`))
old_sha = chomp(read(`git rev-parse refs/remotes/origin/gh-pages`, String))

#run(`julia make.jl`)

Expand All @@ -16,13 +16,13 @@ cd("build") do
ENV["GIT_INDEX_FILE"]=gif
ENV["GIT_WORK_TREE"]=pwd()
run(`git add -A`)
tsha=chomp(readstring(`git write-tree`))
tsha=chomp(read(`git write-tree`, String))
mesg="Deploy docs for master@$last_commit"

if length(old_sha) == 40
csha = chomp(readstring(`git commit-tree $tsha -p $old_sha -m $(mesg)`))
csha = chomp(read(`git commit-tree $tsha -p $old_sha -m $(mesg)`, String))
else
csha = chomp(readstring(`git commit-tree $tsha -m $(mesg)`))
csha = chomp(read(`git commit-tree $tsha -m $(mesg)`, String))
end

print("Created commit $csha")
Expand Down
4 changes: 4 additions & 0 deletions src/TextAnalysis.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
using DataFrames

module TextAnalysis
using SparseArrays
using Printf
using LinearAlgebra

using Languages
using DataFrames
using WordTokenizers
Expand Down
19 changes: 10 additions & 9 deletions src/corpus.jl
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ function DirectoryCorpus(dirname::AbstractString)

cd(dirname)
for filename in readdir(".")
if isfile(filename) && !ismatch(r"^\.", filename)
if isfile(filename) && !occursin(r"^\.", filename)
push!(docs, FileDocument(abspath(filename)))
end
if isdir(filename) && !islink(filename)
Expand Down Expand Up @@ -102,9 +102,10 @@ end
#
##############################################################################

Base.start(crps::Corpus) = 1
Base.next(crps::Corpus, ind::Int) = (crps.documents[ind], ind + 1)
Base.done(crps::Corpus, ind::Int) = ind > length(crps.documents)
# Iteration over a Corpus walks `crps.documents` in order. The state `ind` is
# the index of the next document to yield; `nothing` signals the end.
function Base.iterate(crps::Corpus, ind=1)
    docs = crps.documents
    if ind > length(docs)
        return nothing
    end
    return (docs[ind], ind + 1)
end

##############################################################################
#
Expand All @@ -115,8 +116,8 @@ Base.done(crps::Corpus, ind::Int) = ind > length(crps.documents)
Base.push!(crps::Corpus, d::AbstractDocument) = push!(crps.documents, d)
Base.pop!(crps::Corpus) = pop!(crps.documents)

Base.unshift!(crps::Corpus, d::AbstractDocument) = unshift!(crps.documents, d)
Base.shift!(crps::Corpus) = shift!(crps.documents)
Base.pushfirst!(crps::Corpus, d::AbstractDocument) = pushfirst!(crps.documents, d)
Base.popfirst!(crps::Corpus) = popfirst!(crps.documents)

function Base.insert!(crps::Corpus, index::Int, d::AbstractDocument)
insert!(crps.documents, index, d)
Expand All @@ -133,8 +134,8 @@ Base.delete!(crps::Corpus, index::Integer) = delete!(crps.documents, index)
##############################################################################

Base.getindex(crps::Corpus, ind::Real) = crps.documents[ind]
Base.getindex{T <: Real}(crps::Corpus, inds::Vector{T}) = crps.documents[inds]
Base.getindex(crps::Corpus, r::Range) = crps.documents[r]
Base.getindex(crps::Corpus, inds::Vector{T}) where {T <: Real} = crps.documents[inds]
Base.getindex(crps::Corpus, r::AbstractRange) = crps.documents[r]
Base.getindex(crps::Corpus, term::AbstractString) = get(crps.inverse_index, term, Int[])

##############################################################################
Expand Down Expand Up @@ -226,7 +227,7 @@ hash_function!(crps::Corpus, f::TextHashFunction) = (crps.h = f; nothing)
#
##############################################################################

function standardize!{T <: AbstractDocument}(crps::Corpus, ::Type{T})
function standardize!(crps::Corpus, ::Type{T}) where T <: AbstractDocument
for i in 1:length(crps)
crps.documents[i] = convert(T, crps.documents[i])
end
Expand Down
8 changes: 4 additions & 4 deletions src/deprecations.jl
Original file line number Diff line number Diff line change
@@ -1,22 +1,22 @@

## Deprecations for Languages

function tokenize{S <: Language, T <: AbstractString}(::Type{S}, s::T)
# Deprecated entry point: accepts a `Language` subtype rather than an instance,
# emits a deprecation warning, and forwards to the instance-based method.
function tokenize(::Type{S}, s::T) where {S <: Language, T <: AbstractString}
    # The second argument of `depwarn` must name the deprecated function (it is
    # used to attribute the warning to the call site), not the language type.
    Base.depwarn("Use of Languages as types is deprecated. Use instances.", :tokenize)
    return tokenize(S(), s)
end

function ngramize{S <: Language, T <: AbstractString}(::Type{S}, words::Vector{T}, n::Int)
# Deprecated entry point: accepts a `Language` subtype rather than an instance,
# emits a deprecation warning, and forwards to the instance-based method.
function ngramize(::Type{S}, words::Vector{T}, n::Int) where {S <: Language, T <: AbstractString}
    # The second argument of `depwarn` must name the deprecated function (it is
    # used to attribute the warning to the call site), not the language type.
    Base.depwarn("Use of Languages as types is deprecated. Use instances.", :ngramize)
    return ngramize(S(), words, n)
end

function onegramize{S <: Language, T <: AbstractString}(::Type{S}, words::Vector{T})
# Deprecated entry point: accepts a `Language` subtype rather than an instance,
# emits a deprecation warning, and forwards to the instance-based method.
function onegramize(::Type{S}, words::Vector{T}) where {S <: Language, T <: AbstractString}
    # The second argument of `depwarn` must name the deprecated function (it is
    # used to attribute the warning to the call site), not the language type.
    Base.depwarn("Use of Languages as types is deprecated. Use instances.", :onegramize)
    return onegramize(S(), words)
end

function stem_all{S <: Language}(stemmer::Stemmer, lang::Type{S}, sentence::AbstractString)
# Deprecated entry point: accepts a `Language` subtype rather than an instance,
# emits a deprecation warning, and forwards to the instance-based method.
function stem_all(stemmer::Stemmer, lang::Type{S}, sentence::AbstractString) where S <: Language
    # The second argument of `depwarn` must name the deprecated function (it is
    # used to attribute the warning to the call site), not the language type.
    Base.depwarn("Use of Languages as types is deprecated. Use instances.", :stem_all)
    return stem_all(stemmer, S(), sentence)
end
Expand Down
24 changes: 12 additions & 12 deletions src/document.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#
##############################################################################

type DocumentMetadata
mutable struct DocumentMetadata
language
name::String
author::String
Expand All @@ -31,7 +31,7 @@ abstract type AbstractDocument; end
#
##############################################################################

type FileDocument <: AbstractDocument
# A document referenced by file name. Only the path and the metadata are
# stored here; `text(::FileDocument)` reads the file contents on demand.
mutable struct FileDocument <: AbstractDocument
filename::String
metadata::DocumentMetadata
end
Expand All @@ -48,7 +48,7 @@ end
#
##############################################################################

type StringDocument{T<:AbstractString} <: AbstractDocument
# A document whose text is held directly in the `text` field, together with
# its metadata. `T` is the concrete string type of the stored text.
mutable struct StringDocument{T<:AbstractString} <: AbstractDocument
text::T
metadata::DocumentMetadata
end
Expand All @@ -61,14 +61,14 @@ StringDocument(txt::AbstractString) = StringDocument(txt, DocumentMetadata())
#
##############################################################################

type TokenDocument{T<:AbstractString} <: AbstractDocument
# A document stored as a vector of tokens plus metadata. `T` is the string
# type of the individual tokens.
mutable struct TokenDocument{T<:AbstractString} <: AbstractDocument
tokens::Vector{T}
metadata::DocumentMetadata
end
# Build a TokenDocument from raw text: the text is converted to `String` and
# tokenized using the language recorded in the supplied metadata.
function TokenDocument(txt::AbstractString, dm::DocumentMetadata)
    tkns = tokenize(dm.language, String(txt))
    return TokenDocument(tkns, dm)
end
function TokenDocument{T <: AbstractString}(tkns::Vector{T})
# Build a TokenDocument from an existing token vector, using default metadata.
TokenDocument(tkns::Vector{T}) where {T <: AbstractString} =
    TokenDocument(tkns, DocumentMetadata())
TokenDocument(txt::AbstractString) = TokenDocument(String(txt), DocumentMetadata())
Expand All @@ -79,7 +79,7 @@ TokenDocument(txt::AbstractString) = TokenDocument(String(txt), DocumentMetadata
#
##############################################################################

type NGramDocument{T<:AbstractString} <: AbstractDocument
mutable struct NGramDocument{T<:AbstractString} <: AbstractDocument
ngrams::Dict{T,Int}
n::Int
metadata::DocumentMetadata
Expand All @@ -91,7 +91,7 @@ end
# Build an NGramDocument from raw text with default metadata; `n` defaults to 1.
function NGramDocument(txt::AbstractString, n::Integer=1)
    metadata = DocumentMetadata()
    return NGramDocument(txt, metadata, n)
end
function NGramDocument{T <: AbstractString}(ng::Dict{T, Int}, n::Integer=1)
# Build an NGramDocument from an n-gram count dictionary with default metadata.
# The counts are first merged into a fresh `Dict{AbstractString,Int}`, so the
# stored dictionary is a new object with an `AbstractString` key type.
function NGramDocument(ng::Dict{T, Int}, n::Integer=1) where T <: AbstractString
    counts = merge(Dict{AbstractString,Int}(), ng)
    return NGramDocument(counts, n, DocumentMetadata())
end

Expand All @@ -103,12 +103,12 @@ end

function text(fd::FileDocument)
!isfile(fd.filename) && error("Can't find file: $(fd.filename)")
readstring(fd.filename)
read(fd.filename, String)
end

text(sd::StringDocument) = sd.text
function text(td::TokenDocument)
warn("TokenDocument's can only approximate the original text")
@warn("TokenDocument's can only approximate the original text")
join(td.tokens, " ")
end
function text(ngd::NGramDocument)
Expand All @@ -132,8 +132,8 @@ function tokens(d::NGramDocument)
error("The tokens of an NGramDocument cannot be reconstructed")
end

tokens!{T <: AbstractString}(d::TokenDocument, new_tokens::Vector{T}) = (d.tokens = new_tokens)
function tokens!{T <: AbstractString}(d::AbstractDocument, new_tokens::Vector{T})
# Replace the token vector of a TokenDocument and return the new tokens.
function tokens!(d::TokenDocument, new_tokens::Vector{T}) where {T <: AbstractString}
    d.tokens = new_tokens
    return new_tokens
end
# Fallback for every other document type: their tokens cannot be assigned
# directly, so this method always raises an error.
tokens!(d::AbstractDocument, new_tokens::Vector{T}) where {T <: AbstractString} =
    error("The tokens of a $(typeof(d)) cannot be directly edited")

Expand Down Expand Up @@ -199,7 +199,7 @@ const GenericDocument = Union{
##############################################################################

Document(str::AbstractString) = isfile(str) ? FileDocument(str) : StringDocument(str)
Document{T <: AbstractString}(tkns::Vector{T}) = TokenDocument(tkns)
Document(tkns::Vector{T}) where {T <: AbstractString} = TokenDocument(tkns)
Document(ng::Dict{String, Int}) = NGramDocument(ng)

##############################################################################
Expand Down
20 changes: 10 additions & 10 deletions src/dtm.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#
##############################################################################

type DocumentTermMatrix
mutable struct DocumentTermMatrix
dtm::SparseMatrixCSC{Int, Int}
terms::Vector{String}
column_indices::Dict{String, Int}
Expand Down Expand Up @@ -32,9 +32,9 @@ function DocumentTermMatrix(crps::Corpus, terms::Vector{String})
m = length(crps)
n = length(terms)

rows = Array{Int}(0)
columns = Array{Int}(0)
values = Array{Int}(0)
rows = Array{Int}(undef, 0)
columns = Array{Int}(undef, 0)
values = Array{Int}(undef, 0)
for i in 1:m
doc = crps.documents[i]
ngs = ngrams(doc)
Expand All @@ -57,7 +57,7 @@ function DocumentTermMatrix(crps::Corpus, terms::Vector{String})
end
DocumentTermMatrix(crps::Corpus) = DocumentTermMatrix(crps, lexicon(crps))

DocumentTermMatrix(crps::Corpus, lex::Associative) = DocumentTermMatrix(crps, sort(collect(keys(lex))))
DocumentTermMatrix(crps::Corpus, lex::AbstractDict) = DocumentTermMatrix(crps, sort(collect(keys(lex))))

DocumentTermMatrix(dtm::SparseMatrixCSC{Int, Int},terms::Vector{String}) = DocumentTermMatrix(dtm, terms, columnindices(terms))

Expand All @@ -71,7 +71,7 @@ function dtm(d::DocumentTermMatrix, density::Symbol)
if density == :sparse
return d.dtm
else
return full(d.dtm)
return Matrix(d.dtm)
end
end

Expand Down Expand Up @@ -99,8 +99,8 @@ tdm(crps::Corpus) = dtm(crps)' #'

function dtm_entries(d::AbstractDocument, lex::Dict{String, Int})
ngs = ngrams(d)
indices = Array{Int}(0)
values = Array{Int}(0)
indices = Array{Int}(undef, 0)
values = Array{Int}(undef, 0)
terms = sort(collect(keys(lex)))
column_indices = columnindices(terms)

Expand Down Expand Up @@ -166,7 +166,7 @@ hash_tdm(crps::Corpus) = hash_dtm(crps)' #'
#
##############################################################################

type EachDTV
mutable struct EachDTV
crps::Corpus
end

Expand All @@ -178,7 +178,7 @@ end

# True once `state` has moved past the last document of the wrapped corpus.
# NOTE(review): `done` is part of the pre-0.7 start/next/done iteration
# protocol, whereas Corpus iteration in src/corpus.jl now uses `Base.iterate`.
# This method is not `Base`-qualified — confirm whether EachDTV still needs to
# be ported to `Base.iterate` for `for` loops to work on Julia 0.7/1.0.
done(edt::EachDTV, state::Int) = state > length(edt.crps.documents)

type EachHashDTV
mutable struct EachHashDTV
crps::Corpus
end

Expand Down
2 changes: 1 addition & 1 deletion src/hash.jl
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
#
##############################################################################

type TextHashFunction
mutable struct TextHashFunction
hash_function::Function
cardinality::Int
end
Expand Down
Loading

0 comments on commit e835044

Please sign in to comment.