# SentimentAnalysisReddit_GPU.py

# -*- coding: utf-8 -*-
"""Sentiment140Reddit.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1p1ZoE-cSRKjo_ulwoJcBKVmpm4dnLb2d
"""

#pip install transformers

import tensorflow as tf
import pandas as pd
from mega import Mega
from os.path import exists

# First-run bootstrap: when no local "my_model/" directory exists, download
# the model archive from MEGA and unpack it into the working directory.
if not exists("my_model/"):
    import tarfile

    mega = Mega()
    # No credentials are passed to login().
    m = mega.login()
    try:
        m.download_url('https://mega.nz/file/0eRhiC5I#l6_lGhdE5P7DBdr_MW8IRerCLjcuM-tgaQg-VQvEXA8')
        # The context manager closes the archive even if extraction fails.
        # NOTE(review): extractall() trusts the member paths stored in the
        # archive; only run this against an archive whose origin is trusted.
        with tarfile.open('mymodel.tar.gz') as archive:
            archive.extractall('./')
    except Exception as exc:
        # Still best effort (the script keeps going), but the failure is now
        # reported instead of being silently discarded, and Ctrl-C is no
        # longer swallowed as it was by the former bare `except:`.
        print(f"Download o estrazione del modello non riusciti: {exc}")

from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures

# Load the sequence-classification model and its tokenizer from the local
# "./my_model" directory; both calls read from the same path.
model = TFBertForSequenceClassification.from_pretrained("./my_model")
tokenizer = BertTokenizer.from_pretrained("./my_model")

#pip install praw
 
import praw
import pandas as pd
 
import os

# Reddit API client used by the rest of the script.
#
# SECURITY: the fallback literals below are real credentials committed to
# source control. They are kept only so the script behaves exactly as before
# when no environment variable is set; they should be rotated and removed,
# and the REDDIT_* environment variables used instead.
reddit = praw.Reddit(
    client_id=os.environ.get("REDDIT_CLIENT_ID", "QSmOVScqR5jgFjis7ps7mw"),
    client_secret=os.environ.get("REDDIT_CLIENT_SECRET", "v8up9w5kl4HJFWwDCjsOdn5x5Uyghw"),
    password=os.environ.get("REDDIT_PASSWORD", "Lsk3DnRATYWX99-"),
    user_agent=os.environ.get("REDDIT_USER_AGENT", "testscript by u/franz1020"),
    username=os.environ.get("REDDIT_USERNAME", "franz1020"),
)

import pandas as pd

# Set each of the four pandas display limits (rows, columns, total width,
# column width) to None.
for _display_option in (
    'display.max_rows',
    'display.max_columns',
    'display.width',
    'display.max_colwidth',
):
    pd.set_option(_display_option, None)

# Interactive source selection. Keeps prompting until the user enters '1' or
# '2', then leaves `comments` bound to a DataFrame with a single 'data'
# column holding the raw texts to analyse.
while True:
    choice = input("Inserisci:\n1 Per esaminare i commenti di un post\n2 Per esaminare i top post di un subreddit:\n")

    if choice == '1':
        # Mode 1: take one post and read its comments.
        URL = input("Inserisci url del post:\n")
        # Created before the try so `comments` exists even when the very
        # first Reddit call raises (previously that path ended in a
        # NameError when building the DataFrame below).
        comments = []
        try:
            submission = reddit.submission(url=URL)
            # Per PRAW, limit=0 discards the "load more comments"
            # placeholders instead of fetching them.
            submission.comments.replace_more(limit=0)

            for c in submission.comments:
                comments.append([c.body])

        except Exception as exc:
            # Best effort: continue with whatever was collected, but tell
            # the user why the list may be empty or incomplete.
            print(f"Impossibile recuperare i commenti del post: {exc}")

        comments = pd.DataFrame(comments, columns=['data'])
        break

    elif choice == '2':
        # Mode 2: take the "hot" posts of a subreddit (at most 200).
        chosen_subreddit = input("Inserisci il nome del subreddit:\n")

        posts = []
        subreddit = reddit.subreddit(chosen_subreddit)

        for post in subreddit.hot(limit=200):
            posts.append([post.title, post.score, post.id, post.subreddit, post.url, post.num_comments, post.selftext, post.created])
        posts = pd.DataFrame(posts, columns=['title', 'score', 'id', 'subreddit', 'url', 'num_comments', 'body', 'created'])

        # One text per post: title and body joined by a single space.
        # Iterating the two columns directly replaces the former
        # range(posts.size) loop: DataFrame.size is rows * columns, so that
        # loop ran past the last row and depended on a swallowed exception.
        comments = [
            [title + " " + body]
            for title, body in zip(posts['title'], posts['body'])
        ]

        comments = pd.DataFrame(comments, columns=['data'])
        break

    else:
        print("Scelta non corretta")

import re
import string 

def cleanText(text):
    """Return *text* with mentions, hashes, bracketed runs and URLs removed.

    The patterns are applied one after another, each on the output of the
    previous one, and every match is replaced by the empty string.
    """
    removal_patterns = (
        r'@[A-Za-z0-9]+',     # "@" followed by alphanumerics
        r'#',                  # every "#" character
        r'[@[A-Za-z0-9]+]',    # run of "@", "[" or alphanumerics closed by "]"
        r'\s\(\s',             # "(" with whitespace on both sides
        r'\s\($',              # whitespace + "(" at the end of the text
        r'https?:\/\/\S+',     # http/https URL up to the next whitespace
    )
    cleaned = text
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned

# Clean every text, then discard the rows that should not be classified.
comments['data'] = comments['data'].apply(cleanText)


def _is_unusable(text):
    """Return True for deleted/removed markers, texts containing '?' and texts shorter than 3 characters."""
    return (
        "[deleted]" in text
        or "[removed]" in text
        or '?' in text
        or len(text) < 3
    )


# One boolean mask replaces dropping rows one at a time inside an index loop
# (every drop() call copied the whole frame). The explicit bool dtype makes
# sure an empty frame is still filtered by rows rather than having the mask
# interpreted as a list of column labels.
_keep = pd.Series(
    [not _is_unusable(text) for text in comments['data']],
    index=comments.index,
    dtype=bool,
)
comments = comments[_keep]

import re

def formatString(sentence, width=60):
    """Insert a newline after every *width* characters of *sentence*.

    Existing newline characters count as ordinary characters (DOTALL), and a
    newline is also appended after a final chunk that is exactly *width*
    characters long. A trailing chunk shorter than *width* is left as is.

    Args:
        sentence: The text to wrap.
        width: Number of characters per line; defaults to 60.

    Returns:
        The wrapped text.

    Raises:
        ValueError: If *width* is smaller than 1.
    """
    if width < 1:
        raise ValueError("width must be a positive integer")
    # count/flags are passed by keyword: passing them positionally to re.sub
    # is deprecated since Python 3.13.
    return re.sub("(.{%d})" % width, "\\1\n", sentence, count=0, flags=re.DOTALL)

# Classify every remaining text and print a table of text / predicted label.
pred_sentences = comments["data"].values.tolist()
if not pred_sentences:
    # Nothing is left to classify; stop with a clear message rather than
    # feeding an empty batch to the tokenizer and the model.
    raise SystemExit("Nessun commento da analizzare")

# Tokenise all texts as one batch: padded to a common length and truncated
# to at most 256 tokens, returned as TensorFlow tensors.
tf_batch = tokenizer(pred_sentences, max_length=256, padding=True, truncation=True, return_tensors='tf')
tf_outputs = model(tf_batch)
# NOTE(review): tf_outputs[0] is presumably the logits tensor; softmax over
# the last axis turns it into per-class scores.
tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)
labels = ['Negative', 'Positive']
# Index of the highest-scoring class for every text, as a NumPy array.
label = tf.argmax(tf_predictions, axis=1).numpy()

pred_labels = [labels[class_index] for class_index in label]
# Wrap long texts so the printed table stays readable.
pred_sentences = [formatString(sentence) for sentence in pred_sentences]

data = {'Comment': pred_sentences,
        'Label': pred_labels}

df = pd.DataFrame(data)
from tabulate import tabulate
print(tabulate(df, headers='keys', tablefmt='fancy_grid'))

# Bar chart with the number of texts assigned to each label.
import seaborn as sns
import matplotlib.pyplot as plt
# The labels are passed explicitly as `x`: a positional vector triggers a
# deprecation warning in seaborn 0.11 and is interpreted as `data` from
# seaborn 0.12 on.
sns.countplot(x=pred_labels)
plt.show()