-
Notifications
You must be signed in to change notification settings - Fork 23
/
tfidf_transformer.hpp
128 lines (116 loc) · 2.63 KB
/
tfidf_transformer.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#ifndef TFIDF_TRANSFORMER_HPP
#define TFIDF_TRANSFORMER_HPP
#include "util.hpp"
#include <cstdio>
class TFIDFTransformer
{
private:
typedef std::vector<float> idf_t;
idf_t m_idf;
float m_zero_idf;
public:
TFIDFTransformer(){}
void
train(const std::vector<fv_t> &tf)
{
static const float BETA = 5.0f;
double docs = (double)tf.size();
// word count
m_idf.clear();
for (auto doc = tf.begin(); doc != tf.end(); ++doc) {
for (auto word = doc->begin(); word != doc->end(); ++word) {
if (word->first >= (int)m_idf.size()) {
m_idf.resize(word->first + 1);
}
m_idf[word->first] += 1.0f;
}
}
// compute idf
for (auto idf = m_idf.begin(); idf != m_idf.end(); ++idf) {
*idf = BETA + std::log(docs / (*idf + 1.0f));
}
m_zero_idf = BETA + std::log(docs);
}
void
transform(fv_t &fv) const
{
float dot = 0.0f;
for (auto word = fv.begin(); word != fv.end(); ++word) {
float idf;
float tf = std::log(word->second + 1.0);// + 1.0;
if (word->first < (int)m_idf.size()) {
idf = m_idf[word->first];
} else {
idf = m_zero_idf;
}
word->second = tf * idf;
dot += word->second * word->second;
}
if (dot > 0.0f) {
// L2 normalize
float norm_scale = 1.0f / std::sqrt(dot);
for (auto word = fv.begin(); word != fv.end(); ++word) {
word->second *= norm_scale;
}
}
}
void
transform(std::vector<fv_t> &tf) const
{
#ifdef _OPENMP
#pragma omp parallel for
#endif
for (size_t i = 0; i < tf.size(); ++i) {
transform(tf[i]);
}
}
bool
save(const char *file) const
{
FILE *fp = std::fopen(file, "wb");
if (fp == 0) {
return false;
}
size_t size = m_idf.size();
std::fwrite(&size, sizeof(size), 1, fp);
std::fwrite(m_idf.data(), sizeof(float), size, fp);
std::fwrite(&m_zero_idf, sizeof(m_zero_idf), 1, fp);
fclose(fp);
return true;
}
bool
load(const char *file)
{
FILE *fp = std::fopen(file, "rb");
if (fp == 0) {
return false;
}
size_t size = 0;
size_t ret = std::fread(&size, sizeof(size), 1, fp);
if (ret != 1) {
std::fprintf(stderr, "%s: invalid format 1\n", file);
fclose(fp);
return false;
}
m_idf.clear();
float *buffer = new float[size];
ret = fread(&buffer[0], sizeof(float), size, fp);
if (ret != size) {
std::fprintf(stderr, "%s: invalid format 2\n", file);
delete buffer;
fclose(fp);
return false;
}
std::copy(buffer, buffer + size, std::back_inserter(m_idf));
delete buffer;
ret = std::fread(&m_zero_idf, sizeof(m_zero_idf), 1, fp);
if (ret != 1) {
std::fprintf(stderr, "%s: invalid format 2\n", file);
fclose(fp);
return false;
}
fclose(fp);
return true;
}
};
#endif