renyi entropy added
maximtrp committed Aug 19, 2021
1 parent 5206207 commit 70ae6d7
Showing 4 changed files with 93 additions and 5 deletions.
3 changes: 2 additions & 1 deletion docs/source/bitermplus.metrics.rst
@@ -4,4 +4,5 @@ Metrics
 .. currentmodule:: bitermplus

 .. autofunction:: coherence
-.. autofunction:: perplexity
+.. autofunction:: perplexity
+.. autofunction:: entropy
6 changes: 3 additions & 3 deletions setup.py
@@ -1,6 +1,6 @@
 from setuptools import setup, Extension
-from Cython.Build import cythonize
 from platform import system
+from Cython.Build import cythonize
 # from numpy import get_include

 extra_link_args = ['-lomp'] if system() == 'Darwin' else ['-fopenmp']
@@ -12,12 +12,12 @@
     Extension(
         "bitermplus._btm",
         sources=["src/bitermplus/_btm.pyx"],
-        # include_dirs=[get_include()],
+        # library_dirs=[get_include()],
         extra_compile_args=extra_compile_args,
         extra_link_args=extra_link_args),
     Extension(
         "bitermplus._metrics",
-        # include_dirs=[get_include()],
+        # library_dirs=[get_include()],
         sources=["src/bitermplus/_metrics.pyx"],
         extra_compile_args=extra_compile_args,
         extra_link_args=extra_link_args),
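
As a rough sketch of how these Extension objects are consumed: the surrounding setup() call is not part of this hunk, so its exact shape below is an assumption, with `extensions` standing for the list that holds the two Extension objects above.

    # Assumed tail of setup.py: cythonize the extension list defined above.
    setup(
        ext_modules=cythonize(extensions),
    )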
83 changes: 82 additions & 1 deletion src/bitermplus/_metrics.pyx
@@ -1,5 +1,6 @@
-__all__ = ['perplexity', 'coherence']
+__all__ = ['perplexity', 'coherence', 'entropy']

+from cython.view cimport array
 from libc.math cimport exp, log
 from typing import Union
 from pandas import DataFrame
@@ -160,3 +161,83 @@ cpdef coherence(
         coherence[t] = logSum

     return np.array(coherence)


+@cython.boundscheck(False)
+@cython.wraparound(False)
+@cython.cdivision(True)
+cpdef entropy(
+        double[:, :] p_wz):
+    """Renyi entropy calculation routine [1]_.
+
+    Renyi entropy can be used to estimate the optimal number of topics.
+
+    Parameters
+    ----------
+    p_wz : np.ndarray
+        Topics vs words probabilities matrix (T x W).
+
+    Returns
+    -------
+    renyi : double
+        Renyi entropy value.
+
+    References
+    ----------
+    .. [1] Koltcov, S. (2018). Application of Rényi and Tsallis entropies to
+       topic modeling optimization. Physica A: Statistical Mechanics and its
+       Applications, 512, 1192-1204.
+    """
+    # Number of words
+    cdef int W = p_wz.shape[1]
+    # Number of topics
+    cdef int T = p_wz.shape[0]
+
+    # Initializing variables
+    cdef double word_ratio = 0.
+    cdef double sum_prob = 0.
+    cdef double shannon = 0.
+    cdef double energy = 0.
+    cdef double int_energy = 0.
+    cdef double free_energy = 0.
+    cdef double renyi = 0.
+    cdef double thresh = 1.
+    cdef int t = 0
+    cdef int w = 0
+
+    # Setting the threshold to 1 / W
+    thresh /= W
+
+    # Maximum probability of each word over topics
+    cdef double[:] p_max = array(
+        shape=(W, ), itemsize=sizeof(double), format="d",
+        allocate_buffer=True)
+    p_max[...] = 0.
+
+    for w in range(W):
+        for t in range(T):
+            if p_wz[t, w] > p_max[w]:
+                p_max[w] = p_wz[t, w]
+
+    # Select the probabilities larger than thresh
+    for w in range(W):
+        if p_max[w] > thresh:
+            sum_prob += p_max[w]
+            word_ratio += 1
+
+    # Shannon entropy
+    shannon = log(word_ratio / (W * T))
+
+    # Internal energy
+    int_energy = -log(sum_prob / T)
+
+    # Free energy
+    free_energy = int_energy - shannon * T
+
+    # Renyi entropy
+    if T == 1:
+        renyi = free_energy / T
+    else:
+        renyi = free_energy / (T - 1)
+
+    return renyi
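
For reference, here is a pure-NumPy sketch that mirrors the Cython routine above step by step; the helper name renyi_entropy is hypothetical and not part of the package, but it can serve as a sanity check of the compiled implementation.

    import numpy as np

    def renyi_entropy(p_wz: np.ndarray) -> float:
        # p_wz: topics vs words probability matrix of shape (T, W)
        T, W = p_wz.shape
        thresh = 1.0 / W
        # Maximum probability of each word across topics
        p_max = p_wz.max(axis=0)
        # Words whose maximum probability exceeds the threshold
        sel = p_max > thresh
        word_ratio = sel.sum()
        sum_prob = p_max[sel].sum()
        # Shannon entropy, internal energy, free energy (as defined above)
        shannon = np.log(word_ratio / (W * T))
        int_energy = -np.log(sum_prob / T)
        free_energy = int_energy - shannon * T
        # Renyi entropy
        return free_energy / T if T == 1 else free_energy / (T - 1)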
6 changes: 6 additions & 0 deletions tests/test_btm.py
@@ -77,6 +77,12 @@ def test_btm_class(self):
         self.assertGreater(coherence.shape[0], 0)
         LOGGER.info('Coherence testing finished')

+        LOGGER.info('Entropy testing started')
+        entropy = btm.entropy(model.matrix_topics_words_)
+        self.assertNotEqual(entropy, 0)
+        LOGGER.info("Entropy value: {}".format(entropy))
+        LOGGER.info('Entropy testing finished')
+
         LOGGER.info('Model loading started')
         with open('model.pickle', 'rb') as file:
             self.assertIsInstance(pkl.load(file), btm._btm.BTM)
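
Since the docstring states that Renyi entropy helps estimate the optimal number of topics, a typical usage sketch follows; fitted_models is assumed here to be a dict of already-fitted BTM models keyed by topic count, not part of the package API.

    import bitermplus as btm

    # Renyi entropy per candidate number of topics T; the minimum is
    # conventionally taken as the optimal topic count.
    entropies = {
        T: btm.entropy(model.matrix_topics_words_)
        for T, model in fitted_models.items()
    }
    best_T = min(entropies, key=entropies.get)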
