Skip to content

Commit

Permalink
major bugs fixed in data initialization, normalization, and random number generation
Browse files Browse the repository at this point in the history
  • Loading branch information
maximtrp committed Apr 4, 2021
1 parent 3efa119 commit 6ae2fc1
Show file tree
Hide file tree
Showing 6 changed files with 204 additions and 129 deletions.
18 changes: 10 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,19 +53,21 @@ pip3 install bitermplus
```python
import bitermplus as btm
import numpy as np
from gzip import open as gzip_open
import pandas as pd

# Importing and vectorizing text data
with gzip_open('dataset/SearchSnippets.txt.gz', 'rb') as file:
texts = file.readlines()
# Importing data
df = pd.read_csv(
'dataset/SearchSnippets.txt.gz', header=None, names=['texts'])
texts = df['texts'].str.strip().tolist()

# Vectorizing documents, obtaining full vocabulary and biterms
X, vocab = btm.get_words_freqs(texts)
docs_vec = btm.get_vectorized_docs(X)
biterms = btm.get_biterms(X)
X, vocabulary, vocab_dict = btm.get_words_freqs(texts)
docs_vec = btm.get_vectorized_docs(texts, vocabulary)
biterms = btm.get_biterms(docs_vec)

# Initializing and running model
model = btm.BTM(X, vocab, T=8, W=vocab.size, M=20, alpha=50/8, beta=0.01)
model = btm.BTM(
X, vocabulary, seed=12321, T=8, W=vocabulary.size, M=20, alpha=50/8, beta=0.01)
model.fit(biterms, iterations=20)
p_zd = model.transform(docs_vec)

Expand Down
18 changes: 10 additions & 8 deletions docs/source/tutorial.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,19 +10,21 @@ Here is a simple example of package usage:
import bitermplus as btm
import numpy as np
from gzip import open as gzip_open
import pandas as pd
# Importing and vectorizing text data
with gzip_open('dataset/SearchSnippets.txt.gz', 'rb') as file:
texts = file.readlines()
# Importing data
df = pd.read_csv(
'dataset/SearchSnippets.txt.gz', header=None, names=['texts'])
texts = df['texts'].str.strip().tolist()
# Vectorizing documents, obtaining full vocabulary and biterms
X, vocab = btm.get_words_freqs(texts)
docs_vec = btm.get_vectorized_docs(X)
biterms = btm.get_biterms(X)
X, vocabulary, vocab_dict = btm.get_words_freqs(texts)
docs_vec = btm.get_vectorized_docs(texts, vocabulary)
biterms = btm.get_biterms(docs_vec)
# Initializing and running model
model = btm.BTM(X, vocab, T=8, W=vocab.size, M=20, alpha=50/8, beta=0.01)
model = btm.BTM(
X, vocabulary, seed=12321, T=8, W=vocabulary.size, M=20, alpha=50/8, beta=0.01)
model.fit(biterms, iterations=20)
p_zd = model.transform(docs_vec)
Expand Down
2 changes: 1 addition & 1 deletion src/bitermplus/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = '0.5.10'
__version__ = '0.6.0'

from bitermplus._btm import BTM
from bitermplus._util import *
Expand Down
Loading

0 comments on commit 6ae2fc1

Please sign in to comment.