diff --git a/Model/lsa_Similar.py b/Model/lsa_Similar.py
index e2b1f35..10603fa 100644
--- a/Model/lsa_Similar.py
+++ b/Model/lsa_Similar.py
@@ -7,27 +7,20 @@
 tfidf_vectorizer = TfidfVectorizer()
 komoran = Komoran('STABLE')
 
+def preprocess_text(text):
+    tokens = (komoran.get_plain_text(text)).split(' ')
+    words = [token.split('/')[0] for token in tokens]
+    return ' '.join(words)
+
 def lsa_Similar(contents, answer):
-    test = (komoran.get_plain_text(contents[0])).split(' ')
-    for j in range(len(test)):
-        temp = test[j].split('/')
-        test[j] = temp[0]
-    #print('여기',test)
-    test = ' '.join(test)
-    test2 = (komoran.get_plain_text(answer[0])).split(' ')
-    for j in range(len(test2)):
-        temp = test2[j].split('/')
-        test2[j] = temp[0]
-    #print('여기',test)
-    test2 = ' '.join(test2)
-    tfidf_vectorizer = TfidfVectorizer()
-    tfidf_matrix = tfidf_vectorizer.fit_transform([test, test2])
+    contents_preprocessed = preprocess_text(contents[0])
+    answer_preprocessed = preprocess_text(answer[0])
+
+    tfidf_matrix = tfidf_vectorizer.fit_transform([contents_preprocessed, answer_preprocessed])
 
-    # LSA를 사용하여 차원 축소
     lsa = TruncatedSVD(n_components=2)
     lsa_matrix = lsa.fit_transform(tfidf_matrix)
 
-    # 문장 간 유사도 계산
     similarity_matrix = cosine_similarity(lsa_matrix)
 
     response = {
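
For reference, a minimal usage sketch of the refactored helpers, assuming the module is importable as Model.lsa_Similar and that PyKomoran and scikit-learn are installed; the Korean sample sentences are made up, and the exact shape of the response dict returned by lsa_Similar is not shown in this hunk, so the result is only printed.

# Minimal usage sketch (assumptions: project root on the import path,
# PyKomoran's Komoran('STABLE') model available; sample inputs are illustrative).
from Model.lsa_Similar import preprocess_text, lsa_Similar

contents = ["오늘은 날씨가 맑고 화창합니다."]  # hypothetical submitted text
answer = ["오늘 날씨는 맑습니다."]             # hypothetical reference answer

# preprocess_text strips the POS tag that Komoran's get_plain_text appends
# after each '/' and returns the surface forms joined by spaces.
print(preprocess_text(contents[0]))

# lsa_Similar builds a TF-IDF matrix over the two preprocessed strings,
# reduces it with TruncatedSVD(n_components=2), and compares the rows with
# cosine similarity before packing the result into its response dict.
result = lsa_Similar(contents, answer)
print(result)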