Skip to content

Commit

Permalink
Merge pull request #60 from tharoosha/dev_rate
Browse files (browse the repository at this point in the history)
Dev rate
  • Loading branch information
tharoosha authored Oct 30, 2023
2 parents ad81776 + 9845cd9 commit cdd45d8
Show file tree
Hide file tree
Showing 9 changed files with 1,364 additions and 0 deletions.
648 changes: 648 additions & 0 deletions app.ipynb

Large diffs are not rendered by default.

63 changes: 63 additions & 0 deletions app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
from flask import Flask, request, jsonify
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import SGDClassifier

app = Flask(__name__)


def _split_categories(labels):
    """Split a comma-separated label string into its non-empty labels.

    A named function (rather than a lambda) keeps the fitted vectorizer
    picklable, and dropping empty pieces prevents rows without labels from
    adding a meaningless '' category to the vocabulary.
    """
    return [label for label in labels.split(',') if label]


# Load the labelled titles and fit every model once at import time; the
# fitted objects below are module-level globals used by the request handlers.
df = pd.read_csv('title_category.csv')
df = df.rename(columns={'Unnamed: 0': 'Index'})
df = df.dropna(subset=['Index'])
# Labels are stored joined by the literal marker '__##__'; normalise it to a
# comma. regex=False makes the literal match explicit.
df['Type of Video'] = df['Type of Video'].str.replace('__##__', ',', regex=False)

titles = df['Title of the video'].tolist()

# Character n-gram TF-IDF. `token_pattern` and `stop_words` are not passed
# because scikit-learn ignores them (with a warning) when analyzer='char'.
# `use_idf` / `smooth_idf` are real booleans: recent scikit-learn releases
# reject integer stand-ins for boolean parameters.
tf_idf = TfidfVectorizer(
    min_df=2,
    max_features=None,
    strip_accents='unicode',
    norm='l2',
    analyzer='char',
    ngram_range=(1, 5),
    use_idf=True,
    smooth_idf=True,
)
features = tf_idf.fit_transform(titles).toarray()

embedder = SentenceTransformer('bert-base-nli-mean-tokens')
semantic_embedder = SentenceTransformer('bert-base-nli-stsb-mean-tokens')
bert_features = embedder.encode(titles)
semantic_bert_features = semantic_embedder.encode(titles)

# Column order here must match the order used by generate_embedding().
final_features = np.hstack((features, bert_features, semantic_bert_features))

# Assign the result back instead of a chained `inplace=True` call, which is
# deprecated and has no effect under pandas copy-on-write.
df['Type of Video'] = df['Type of Video'].fillna('')

# token_pattern=None silences the "token_pattern will not be used" warning
# that scikit-learn emits whenever a custom tokenizer is supplied.
vectorizer = CountVectorizer(tokenizer=_split_categories, token_pattern=None)
y = vectorizer.fit_transform(df['Type of Video'])

# One linear classifier per category column (multi-label setup).
clf = MultiOutputClassifier(SGDClassifier(max_iter=4000)).fit(final_features, y.toarray())

def generate_embedding(text):
    """Build the combined feature vector for a single piece of text.

    The result is the TF-IDF vector of ``text`` followed by its encodings
    from ``embedder`` and ``semantic_embedder``, stacked in that order.
    """
    parts = (
        tf_idf.transform([text]).toarray()[0],
        embedder.encode([text], show_progress_bar=False)[0],
        semantic_embedder.encode([text], show_progress_bar=False)[0],
    )
    return np.hstack(parts)

def get_terms(pred_list):
    """Return the title-cased vocabulary terms flagged in ``pred_list``.

    ``pred_list`` is a single prediction row; it is wrapped in a list so
    ``vectorizer.inverse_transform`` sees a one-sample batch.
    """
    decoded_rows = vectorizer.inverse_transform([pred_list])
    terms = decoded_rows[0]
    titled = []
    for term in terms:
        titled.append(term.title())
    return titled

def get_topics(text):
    """Predict the category names for one piece of text.

    Embeds ``text``, runs ``clf`` on it as a one-sample batch, and converts
    the first (only) prediction row into category names.
    """
    batch = [generate_embedding(text)]
    return get_terms(clf.predict(batch)[0])

@app.route('/classify', methods=['POST'])
def classify_video():
    """Classify a video title sent as JSON and return its categories.

    Expects a JSON object body containing a string ``video_title`` field and
    responds with ``{"categories": [...]}``.

    A missing or malformed JSON body, or a ``video_title`` that is absent,
    blank, or not a string, produces a 400 response of the form
    ``{"error": "..."}`` instead of an unhandled exception (HTTP 500).
    """
    app.logger.info("Model called")

    # silent=True returns None for a missing or malformed JSON body instead
    # of raising, so every bad request gets the same JSON error shape.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({'error': 'Request body must be a JSON object.'}), 400

    video_title = data.get('video_title')
    if not isinstance(video_title, str) or not video_title.strip():
        return jsonify({'error': "'video_title' must be a non-empty string."}), 400

    # Call the get_topics function to classify the video
    categories = get_topics(video_title)

    # Return the categories as a JSON response
    return jsonify({'categories': categories})

if __name__ == '__main__':
    import os

    # Debug mode enables the Werkzeug interactive debugger, which lets anyone
    # who can reach the server execute arbitrary code. Keep it off by default
    # and opt in explicitly for local development with FLASK_DEBUG=1.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').strip().lower() in ('1', 'true', 'yes')
    app.run(debug=debug_enabled)
Binary file added backend/ml_models/recommanded_system/model.pkl
Binary file not shown.
Binary file added backend/ml_models/recommanded_system/vectorizer.pkl
Binary file not shown.
Binary file added model.pkl
Binary file not shown.
Binary file added multioutput_classifier_model.pkl
Binary file not shown.
21 changes: 21 additions & 0 deletions res.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import requests

# URL of the classification endpoint (Flask development server default address)
url = 'http://127.0.0.1:5000/classify'

# Video title you want to classify
video_title = "Eric Weinstein: Revolutionary Ideas in Science, Math, and Society | Artificial Intelligence Podcast"

# Prepare the data as a JSON payload
data = {'video_title': video_title}

# Send the POST request. requests has no default timeout, so without one the
# script would block indefinitely if the server never answers.
try:
    response = requests.post(url, json=data, timeout=30)
except requests.RequestException as exc:
    # Covers connection refused, timeouts, and other transport failures.
    print("Error:", exc)
else:
    # Check the response
    if response.status_code == 200:
        # The response should contain the categories
        categories = response.json()['categories']
        print("Categories:", categories)
    else:
        print("Error:", response.text)
632 changes: 632 additions & 0 deletions title_category.csv

Large diffs are not rendered by default.

Binary file added vectorizer.pkl
Binary file not shown.

0 comments on commit cdd45d8

Please sign in to comment.