# psfeatureextractor.py

# -*- coding: utf-8 -*-
"""psfeatureextractor.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1HhEiN2Wmbk9Y53Dvt2Lw804FPRlkdBdP
"""

# -*- coding: utf-8 -*-
"""PSFeatureExtractor.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1-zwppOfr0Cr1k15U5cOBA8Kd3xdKSXWr
"""

# -*- coding: utf-8 -*-
"""BaseLineWithGridSearch.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1cxXw92aQZKvWJGmOdKIYGGXRMpK_XNvQ

# Import Required Libraries
"""
import nltk
from nltk import word_tokenize as nltk_word_tokenize
import string
import pandas as pd
import re
import numpy as np
from pathlib import Path
from google.colab import drive
import torch
import stanfordnlp
import itertools
import sklearn
from sklearn.model_selection import train_test_split
from hazm import *
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from difflib import SequenceMatcher
import warnings
warnings.filterwarnings('ignore')
from openpyxl import load_workbook
import joblib
from sklearn.feature_extraction.text import CountVectorizer
import os.path

nltk.download('punkt')

# Persian cue words and phrases (refuting, hedging and reporting expressions).
# calculate_root_distance() looks for these in a headline.
# Every entry ends with a comma: a missing comma makes Python silently
# concatenate two adjacent string literals into one bogus entry.
refute_hedge_reporte_words = [
    'جعلی',
    'تقلب',
    'فریب',
    'حیله',
    'کلاهبرداری',
    'شیادی',
    'دست انداختن',
    'گول زدن',
    'نادرست',
    'غلط',
    'کذب',
    'ساختگی',
    'قلابی',
    'انکار',
    'رد',
    'تکذیب',
    'تکذیب کردن',
    'تکذیب شد',
    'انکار کردن',
    'انکار می کند',
    'نه',
    'با وجود',
    'علیرغم',
    'با اینکه',
    'شک داشتن',
    'تردید کردن',
    'مظنون بودن',
    'شک',
    'تردید',
    'دو دلی',
    'گمان',
    'به گزارش',
    'ادعا شده',
    'به قول معروف',
    'بنا به گفته',
    'ظاهرا',
    'به نظر می رسد',
    'ادعا',
    'میتوانست',
    'می تواند',
    'از قرار معلوم',
    'مشخصا',
    'تا حد زیادی',
    'احتمال دارد',
    'شاید',
    'به طور عمده',
    'ممکن است',
    'گویا',
    'ممکن',
    'اغلب',
    'غالبا',
    'احتمالا',
    'احتمالاً',
    'محتملا',
    'گفته شده',
    'گزارش داد',
    'طبق گزارش',
    'شایعه',
    'شایعات',
    'شایعه شده',
    'قدری',
    'تا حدی',
    'تأیید نشده',
]

if __name__ == "__main__":
   # Code placed here runs only when this file is executed directly, not
   # when it is imported as a module. It is currently a no-op placeholder.
   pass

"""# Define Class"""

# -------------------------------------
class PSFeatureExtractor():
  # -------------------------------------
  def __init__(self, dataset_path, stopWord_path, polarity_dataset_path, stanford_models_path , use_google_drive = True, important_words = None):
    """Store the configuration, then load the stop words and the dataset.

    Args:
      dataset_path: CSV file read by ``__read_dataset``.
      stopWord_path: text file read by ``__get_stop_words``.
      polarity_dataset_path: workbook read by ``load_polarity_dataset``.
      stanford_models_path: models directory handed to the stanfordnlp
        pipeline.
      use_google_drive: when true, mount Google Drive at ``/content/drive``
        before any file is read.
      important_words: optional words used by ``calc_important_words``.
    """
    # Extra (mostly Persian/Arabic) punctuation and diacritic marks that are
    # stripped in addition to ``string.punctuation``.
    self.fa_punctuations = ['،','«','»',':','؛','ْ','ٌ','ٍ','ُ','ِ','َ','ّ','ٓ','ٰ','-','*']

    # Buffers filled later by the *_tokenize methods.
    self.clean_claims = []
    self.clean_headlines = []
    self.clean_claims_headlines = []

    # Plain configuration values.
    self.important_words = important_words
    self.stanford_models_path = stanford_models_path
    self.use_google_drive = use_google_drive
    self.polarity_dataset_path = polarity_dataset_path
    self.stopWord_path = stopWord_path
    self.dataset_path = dataset_path

    # The drive has to be mounted before the files below are opened.
    if use_google_drive:
      from google.colab import drive
      drive.mount('/content/drive')

    self.fa_stop_words = self.__get_stop_words()
    loaded = self.__read_dataset()
    self.claims, self.headlines, self.isQuestion,self.hasTowParts, self.labels, self.dataset = loaded
  # -------------------------------------
  def __get_stop_words(self):
      """Read the stop-word file and return its normalized lines.

      The path in ``self.stopWord_path`` is printed, then the file is read
      line by line. Trailing newline / carriage-return characters are
      removed and each line is passed through ``Normalizer.normalize``.

      Returns:
        list: one normalized entry per line of the file.
      """
      normalizer = Normalizer()
      print(self.stopWord_path)
      # Decode explicitly as UTF-8 (the same encoding used for the dataset
      # CSV) instead of relying on the platform default, which is not UTF-8
      # everywhere and would garble or reject Persian text.
      with open(self.stopWord_path, encoding='utf-8') as f:
        return [normalizer.normalize(line.rstrip("\n\r")) for line in f]
  # ---------------------------------------------------
  def clean_sentence(self, sentence):
    """Strip reference tags, invisible characters and punctuation.

    Removal steps, applied in this order:
      1. a slash, optional whitespace, the word "شایعه" (raw or normalized),
         optional whitespace and a run of digits;
      2. a slash, optional whitespace and a run of digits;
      3. the characters U+200C, U+200D, U+200E, U+200B, U+2067 and U+2069;
      4. every character of ``string.punctuation`` and
         ``self.fa_punctuations``.

    Args:
      sentence: the text to clean.

    Returns:
      str: the cleaned text.
    """
    normalizer = Normalizer()
    shayee = normalizer.normalize("شایعه")

    # Raw strings keep the regex escapes intact; in a normal string literal
    # a backslash followed by "s" is an invalid escape sequence.
    rumor_tag_pattern = r"(/(\s)*" + shayee + r"(\s)*[0-9]+)|(/(\s)*شایعه(\s)*[0-9]+)"
    number_tag_pattern = r"/(\s)*[0-9]+"
    invisible_chars_pattern = r"\u200c|\u200d|\u200e|\u200b|\u2067|\u2069"

    # re.sub returns the input unchanged when nothing matches, so no
    # preliminary re.search is needed.
    clean_sentences = re.sub(rumor_tag_pattern, "", sentence)
    clean_sentences = re.sub(number_tag_pattern, "", clean_sentences)
    clean_sentences = re.sub(invisible_chars_pattern, "", clean_sentences)

    punc_regex = re.compile('|'.join(map(re.escape, list(string.punctuation) + list(self.fa_punctuations))))
    return punc_regex.sub("", clean_sentences)
  # ---------------------------------------------
  def __read_dataset(self):
    """Load the CSV at ``self.dataset_path`` and split it into columns.

    The columns ``claim``, ``headline``, ``IsQuestion``, ``HasTowParts`` and
    ``label`` are extracted as arrays; their shapes must all match. The
    shapes are printed before returning.

    Returns:
      tuple: (claims, headlines, isQuestion, hasTowParts, labels, dataframe).
    """
    frame = pd.read_csv(self.dataset_path, encoding = 'utf-8')

    column_names = ('claim', 'headline', 'IsQuestion', 'HasTowParts', 'label')
    claims, headlines, isQuestion, hasTowParts, labels = (frame[name].values for name in column_names)

    reference_shape = claims.shape
    other_shapes = (headlines.shape, isQuestion.shape, labels.shape, hasTowParts.shape)
    assert all(shape == reference_shape for shape in other_shapes), "The features size are not equal."

    print(claims.shape, headlines.shape, isQuestion.shape, hasTowParts.shape, labels.shape)
    return claims, headlines, isQuestion, hasTowParts, labels, frame
  # ---------------------------------------
  def stanford_tokenize(self, root_model_path, just_get_tokenized_words = False): 
    """Clean and tokenize every claim/headline pair with stanfordnlp.

    Side effects: rebuilds ``self.clean_claims``, ``self.clean_headlines``
    and ``self.clean_claims_headlines`` and stores the filtered tokens in
    ``self.tokens_claims`` / ``self.tokens_headlines``.

    Only the first sentence of each parsed document is used.

    Args:
      root_model_path: not used by this method; the pipeline loads its
        models from ``self.stanford_models_path``.
      just_get_tokenized_words: when true, return the filtered token lists
        instead of the pipeline word objects.

    Returns:
      tuple: ``(tokens_claims, tokens_headlines)`` when
      ``just_get_tokenized_words`` is true, otherwise the per-text lists of
      word objects ``(claims_processors_result, headlines_processors_result)``.
    """
    nlp = stanfordnlp.Pipeline(lang='fa', models_dir= self.stanford_models_path, treebank=None, use_gpu=True) 

    # Start from empty buffers: without this, calling a tokenizer a second
    # time appends every row again and the feature matrices built from these
    # lists no longer line up with the dataset.
    self.clean_claims = []
    self.clean_headlines = []
    self.clean_claims_headlines = []

    claims_processors_result = []
    headlines_processors_result = []
    claims_tokenize = []
    headlines_tokenize = []

    for claim, headline in zip(self.claims, self.headlines):
      clean_claim = self.clean_sentence(claim)
      self.clean_claims.append(clean_claim)
      claim_words = nlp(clean_claim).sentences[0].words
      claims_processors_result.append(claim_words)
      # Real lists (not single-use generators) so the tokens can be
      # iterated more than once.
      claims_tokenize.append([obj.text for obj in claim_words])

      clean_headline = self.clean_sentence(headline)
      self.clean_headlines.append(clean_headline)
      headline_words = nlp(clean_headline).sentences[0].words
      headlines_processors_result.append(headline_words)
      headlines_tokenize.append([obj.text for obj in headline_words])

      self.clean_claims_headlines.append(clean_claim + ' ' + clean_headline)

    self.tokens_claims = self.clean_tokens(target_list = claims_tokenize)
    self.tokens_headlines = self.clean_tokens(target_list = headlines_tokenize)
    if just_get_tokenized_words:
      return self.tokens_claims , self.tokens_headlines

    return claims_processors_result , headlines_processors_result
  # ------------------------------------------------
  def clean_tokens(self, target_list):
    """Drop stop words and punctuation tokens from tokenized texts.

    A token is removed when its normalized form is one of
    ``self.fa_stop_words``, a ``string.punctuation`` character or one of
    ``self.fa_punctuations``. Kept tokens are returned un-normalized.

    Args:
      target_list: list whose items are iterables of tokens.

    Returns:
      list: one list of kept tokens per input item.

    Raises:
      AssertionError: if ``target_list`` is not a list.
    """
    assert isinstance(target_list, list), "Type of target_list is not correct. It has to be list."
    normalizer = Normalizer()

    # A set gives constant-time membership tests; the previous concatenated
    # list was scanned once per token.
    denied_words = set(self.fa_stop_words) | set(string.punctuation) | set(self.fa_punctuations)

    return [
      [token for token in item if normalizer.normalize(token) not in denied_words]
      for item in target_list
    ]
  # --------------------------------------------------
  def hazm_tokenize(self):
    """Clean and tokenize every claim/headline pair with ``word_tokenize``.

    Side effects: rebuilds ``self.clean_claims``, ``self.clean_headlines``
    and ``self.clean_claims_headlines`` and stores the filtered tokens in
    ``self.tokens_claims`` / ``self.tokens_headlines``.

    Returns:
      tuple: ``(tokens_claims, tokens_headlines)``.
    """
    # Start from empty buffers: without this, calling a tokenizer a second
    # time appends every row again and the feature matrices built from these
    # lists no longer line up with the dataset.
    self.clean_claims = []
    self.clean_headlines = []
    self.clean_claims_headlines = []

    claims_result = []
    headlines_result = []

    for claim, headline in zip(self.claims, self.headlines):
      clean_claim = self.clean_sentence(claim)
      self.clean_claims.append(clean_claim)
      claims_result.append(word_tokenize(clean_claim))

      clean_headline = self.clean_sentence(headline)
      self.clean_headlines.append(clean_headline)
      headlines_result.append(word_tokenize(clean_headline))

      self.clean_claims_headlines.append(clean_claim + ' ' + clean_headline)

    self.tokens_claims = self.clean_tokens(target_list = claims_result)
    self.tokens_headlines = self.clean_tokens(target_list = headlines_result)
    return self.tokens_claims , self.tokens_headlines

  # --------------------------------------------------
  def nltk_tokenize(self):
    """Clean and tokenize every claim/headline pair with NLTK.

    Side effects: rebuilds ``self.clean_claims``, ``self.clean_headlines``
    and ``self.clean_claims_headlines`` and stores the filtered tokens in
    ``self.tokens_claims`` / ``self.tokens_headlines``.

    Returns:
      tuple: ``(tokens_claims, tokens_headlines)``.
    """
    # Start from empty buffers: without this, calling a tokenizer a second
    # time appends every row again and the feature matrices built from these
    # lists no longer line up with the dataset.
    self.clean_claims = []
    self.clean_headlines = []
    self.clean_claims_headlines = []

    claims_result = []
    headlines_result = []

    for claim, headline in zip(self.claims, self.headlines):
      clean_claim = self.clean_sentence(claim)
      self.clean_claims.append(clean_claim)
      claims_result.append(nltk_word_tokenize(clean_claim))

      clean_headline = self.clean_sentence(headline)
      self.clean_headlines.append(clean_headline)
      headlines_result.append(nltk_word_tokenize(clean_headline))

      self.clean_claims_headlines.append(clean_claim + ' ' + clean_headline)

    self.tokens_claims = self.clean_tokens(target_list = claims_result)
    self.tokens_headlines = self.clean_tokens(target_list = headlines_result)
    return self.tokens_claims , self.tokens_headlines
  # --------------------------------------------------
  def tf_idf(self):
    """Return the dense TF-IDF matrix of ``self.clean_claims_headlines``.

    The vectorizer is fitted on the combined claim+headline texts with
    sublinear term frequency, ``min_df=10``, L2 normalization and
    unigram/bigram features.
    """
    vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=10, norm='l2', ngram_range=(1, 2))
    sparse_matrix = vectorizer.fit_transform(self.clean_claims_headlines)
    return sparse_matrix.toarray()
  # --------------------------------------------------
  def similarity(self):
    """Compute string-similarity ratios for each cleaned claim/headline pair.

    Returns:
      list: one ``[ratio, quick_ratio, real_quick_ratio]`` row per pair,
      as produced by ``difflib.SequenceMatcher``.
    """
    rows = []
    for claim_text, headline_text in zip(self.clean_claims, self.clean_headlines):
      # One matcher serves all three measures for the same pair.
      matcher = SequenceMatcher(None, claim_text, headline_text)
      rows.append([matcher.ratio(), matcher.quick_ratio(), matcher.real_quick_ratio()])
    return rows
  # --------------------------------------------------
  def calc_important_words(self):
    """Build a binary presence matrix of the important words.

    Cell ``[i][j]`` is 1 when ``self.important_words[j]`` occurs as a
    substring of ``self.clean_claims_headlines[i]``, else 0.

    Returns:
      numpy.ndarray: array of shape (number of texts, number of words).

    Raises:
      AssertionError: if no important words were given to the initializer.
    """
    # Identity test: "!= None" is evaluated element-wise on numpy arrays and
    # then fails inside the assert with an ambiguous-truth-value error.
    assert self.important_words is not None, 'For calculating important words you should pass important words in initializer.'
    features = np.zeros((len(self.clean_claims_headlines), len(self.important_words)))
    for row, text in enumerate(self.clean_claims_headlines):
      for col, word in enumerate(self.important_words):
        if word in text:
          features[row][col] = 1
    return features
  # --------------------------------------------------
  def calculate_root_distance(self ,target_sentences = None): # target_sentences = clean_headlines
    """Distance between the dependency root and the first cue word.

    Each sentence is parsed with the stanfordnlp pipeline and the first word
    whose dependency relation is ``'root'`` is located in the first parsed
    sentence. The sentence is then split on whitespace and the first token
    found in ``refute_hedge_reporte_words`` gives the feature value
    ``abs(token position - root position)``.

    NOTE(review): the token position comes from ``str.split`` while the root
    position comes from the parser's word list; this assumes both
    tokenizations line up - confirm. Multi-word cue phrases cannot match a
    single whitespace-separated token.

    Args:
      target_sentences: sentences to process; defaults to
        ``self.clean_headlines``.

    Returns:
      numpy.ndarray: shape ``(len(target_sentences), 1)``; -1 where no root
      or no cue word was found.
    """
    # Identity test so that numpy arrays can be passed as target_sentences.
    if target_sentences is None:
      target_sentences = self.clean_headlines

    nlp = stanfordnlp.Pipeline(lang='fa', models_dir= self.stanford_models_path, treebank=None, use_gpu=True) 
    cue_words = set(refute_hedge_reporte_words)

    # -1 marks "no root / no cue word found".
    root_distance_feature = np.full((len(target_sentences), 1), -1.0)
    for index, headline in enumerate(target_sentences):
      doc = nlp(headline)
      if not doc.sentences:
        # Nothing was parsed; keep the -1 marker.
        continue

      parsed_words = doc.sentences[0].words
      root_index = next(
        (i for i, parsed in enumerate(parsed_words) if parsed.dependency_relation == 'root'),
        None,
      )
      if root_index is None:
        continue

      for word_index, word in enumerate(headline.split()):
        if word in cue_words:
          root_distance_feature[index] = abs(word_index - root_index)
          break
    return root_distance_feature
  # --------------------------------------------------
  def load_polarity_dataset(self):
    """Read the polarity workbook into a ``{word: polarity}`` dictionary.

    The active sheet of ``self.polarity_dataset_path`` is scanned row by
    row: the first cell is the key and the third cell the value. Rows whose
    third cell is empty or holds the text ``"Polarity"`` are skipped.
    """
    workbook = load_workbook(filename = self.polarity_dataset_path)
    polarity_by_word = {}
    for cells in workbook.active.iter_rows():
      polarity = cells[2].value
      if polarity is None or polarity == "Polarity":
        continue
      polarity_by_word[cells[0].value] = polarity
    return polarity_by_word
  # --------------------------------------------------
  def calculate_polarity(self, target_sentences = None):
    """Build a fixed-width polarity vector for each claim/headline pair.

    For every pair, the polarity of each of the first 15 claim tokens goes
    into columns 0-14 and that of the first 15 headline tokens into columns
    15-29. Tokens missing from the polarity dictionary leave a 0.

    Args:
      target_sentences: iterable of ``(claim_tokens, headline_tokens)``
        pairs; defaults to ``zip(self.tokens_claims, self.tokens_headlines)``.

    Returns:
      numpy.ndarray: array of shape ``(number of pairs, 30)``.
    """
    words_polarity_fa = self.load_polarity_dataset()

    # Identity test so that numpy arrays can be passed as target_sentences.
    if target_sentences is None:
      target_sentences = zip(self.tokens_claims, self.tokens_headlines)

    tokens_per_text = 15
    pairs = list(target_sentences)
    # The row count comes straight from the list: converting the ragged
    # token lists to a numpy array only to take its length fails on recent
    # NumPy versions, and an empty input now yields an empty (0, 30) matrix.
    polarity_vector = np.zeros((len(pairs), 2 * tokens_per_text))

    for i, (claim, headline) in enumerate(pairs):
      for j, token in enumerate(claim[:tokens_per_text]):
        if token in words_polarity_fa:
          polarity_vector[i][j] = words_polarity_fa[token]
      for j, token in enumerate(headline[:tokens_per_text]):
        if token in words_polarity_fa:
          polarity_vector[i][j + tokens_per_text] = words_polarity_fa[token]
    return polarity_vector
  # --------------------------------------------------
  # Function to average all word vectors in a paragraph
  def __feature_vector_method(self, words, model, num_features):
    """Average the embedding vectors of the words known to ``model``.

    Args:
      words: iterable of tokens.
      model: object exposing ``model.wv.index2word`` (the vocabulary) and
        ``model[word]`` (the vector of a word).
      num_features: length of the returned vector.

    Returns:
      numpy.ndarray: float32 mean of the vectors of the in-vocabulary words,
      or all zeros when none of the words is in the vocabulary.
    """
    featureVec = np.zeros(num_features, dtype="float32")
    # A set makes the per-word vocabulary test constant-time.
    index2word_set = set(model.wv.index2word)

    # Count the vectors actually summed. The previous code divided by the
    # last enumerate index, i.e. by len(words) - 1, which skewed the mean
    # and divided by zero for one-word or empty inputs.
    nwords = 0
    for word in words:
      if word in index2word_set:
        featureVec = np.add(featureVec, model[word])
        nwords += 1

    if nwords == 0:
      return featureVec
    return np.divide(featureVec, nwords)
  # --------------------------------------------------
  def get_w2v_feature(self, model, num_features, target_sentences = None):
    """Compute one averaged word-vector row per sentence.

    Args:
      model: embedding model passed on to ``__feature_vector_method``.
      num_features: dimensionality of the embedding vectors.
      target_sentences: sentences to embed, each either a string (split on
        whitespace into words) or an iterable of tokens; defaults to
        ``self.clean_claims_headlines``.

    Returns:
      numpy.ndarray: float32 array of shape
      ``(len(target_sentences), num_features)``.
    """
    # Identity test so that numpy arrays can be passed as target_sentences.
    if target_sentences is None:
      target_sentences = self.clean_claims_headlines

    reviewFeatureVecs = np.zeros((len(target_sentences),num_features),dtype="float32")
    for counter, sentence in enumerate(target_sentences):
      # Print a status message every 1000th sentence.
      if counter%1000 == 0:
        print("data %d of %d"%(counter,len(target_sentences)))

      # The default input holds plain strings; iterating a string yields
      # single characters, so split it into words before the lookup.
      words = sentence.split() if isinstance(sentence, str) else sentence
      reviewFeatureVecs[counter] = self.__feature_vector_method(words, model, num_features)

    return reviewFeatureVecs
  # --------------------------------------------------
  def get_bow(self, target_sentences = None):
    """Return dense unigram+bigram count features.

    Args:
      target_sentences: texts to vectorize; defaults to
        ``self.clean_claims_headlines``.

    Returns:
      numpy.ndarray: dense count matrix produced by ``CountVectorizer``
      fitted on the given texts.
    """
    # Identity test: "== None" is evaluated element-wise on numpy arrays and
    # makes the "if" fail with an ambiguous-truth-value error.
    if target_sentences is None:
      target_sentences = self.clean_claims_headlines
    vectorizer = CountVectorizer(ngram_range=(1, 2))
    X = vectorizer.fit_transform(target_sentences)
    return X.toarray()
  # --------------------------------------------------
  def generate_Features(self, w2v_model_path, save_path, load_path, save_feature = False, load_if_exist = True, similarity = True, important_words = True, is_question = True, more_than2_parts = True, root_distance = True, polarity = True , w2v = True , bow = True ,tfidf = True):
    """Assemble the feature matrix from the selected feature groups.

    Columns are appended in this order: is-question flag, tf-idf,
    similarity, important words, more-than-two-parts flag, root distance,
    polarity, word2vec (min-max scaled over the whole block), bag of words.

    Args:
      w2v_model_path: joblib file holding the word2vec model (needed when
        ``w2v`` is true).
      save_path: directory for the saved matrix (used when ``save_feature``
        is true).
      load_path: directory searched for a previously saved matrix (needed
        when ``load_if_exist`` is true).
      save_feature: dump the resulting matrix with joblib.
      load_if_exist: return a previously saved matrix when one exists for
        the same flag combination instead of recomputing.
      similarity, important_words, is_question, more_than2_parts,
      root_distance, polarity, w2v, bow, tfidf: switches for the individual
        feature groups.

    Returns:
      tuple: ``(features, file_name)``. ``file_name`` is the flag-derived
      base name without extension; it is empty when neither loading nor
      saving was requested.

    Raises:
      AssertionError: if a required path argument is empty.
    """
    # The is-question column always comes first; it is dropped again below
    # when the flag is off.
    features = np.reshape(self.isQuestion, (len(self.isQuestion), 1))
    file_name = ''

    if load_if_exist or save_feature:
      # The cache file name encodes the enabled groups, in this fixed order.
      name_parts = (
        (tfidf, 'tfidf_'),
        (similarity, 'similarity_'),
        (important_words, 'important_words_'),
        (is_question, 'is_question_'),
        (more_than2_parts, 'more_than2_parts_'),
        (root_distance, 'root_distance_'),
        (polarity, 'polarity_'),
        (w2v, 'w2v_'),
        (bow, 'bow_'),
      )
      file_name = ''.join(tag for enabled, tag in name_parts if enabled)

    if load_if_exist:
      assert len(load_path) > 0, "Please enter load_path."
      load_file_name = load_path + '/' + file_name + '.pkl'
      if os.path.isfile(load_file_name):
        features = joblib.load(load_file_name)
        print('Features loaded successfully.')
        return features, file_name
      print('Features vector file is not exist.')
    # -------------- tfidf ----------
    if tfidf:
      print('Start to generate tf_idf feature')
      tf_idf_feature = self.tf_idf()
      features = np.append(features, tf_idf_feature ,axis = 1)
      print('End of tf_idf feature')
    # -------------- similarity ----------
    if similarity:
      print('Start to generate similarity feature')
      similarity_feature = self.similarity()
      features = np.append(features, similarity_feature ,axis = 1)
      print('End of similarity feature')
    # -------------- important words ----------
    if important_words:
      print('Start to generate important words feature')
      important_words_feature = self.calc_important_words()
      features = np.append(features, important_words_feature ,axis = 1)
      print('End of important words feature')
    # -------------- is question ----------
    if is_question:
      print('"is question" feature was added.')
    else:
      # Column 0 is still the is-question flag at this point; remove it.
      features = features[:,1:]
    # -------------- more than two parts ----------
    if more_than2_parts:
      features = np.append(features, np.reshape(self.hasTowParts, (len(self.hasTowParts),1)) ,axis = 1)
      print('"more than tow parts" feature was added.')
    # -------------- root distance ----------
    if root_distance:
      print('Start to generate root distance feature')
      root_distance_feature = self.calculate_root_distance()
      features = np.append(features, root_distance_feature ,axis = 1)
      print('End of root distance feature')
    # -------------- polarity ----------
    if polarity:
      print('Start to generate polarity feature')
      polarity_feature = self.calculate_polarity()
      features = np.append(features, polarity_feature ,axis = 1)
      print('End of polarity feature')
    # -------------- w2v ----------
    if w2v:
      print('Start to generate w2v feature')
      assert len(w2v_model_path) > 0, "Please enter w2v_model_path."
      # NOTE(review): joblib.load unpickles the file; only load model files
      # from a trusted source.
      w2v_model = joblib.load(w2v_model_path)
      w2v_feature = self.get_w2v_feature(w2v_model, num_features = 300)
      # Min-max scale the whole block into [0, 1]. A constant block has zero
      # range; map it to zeros instead of dividing by zero.
      lowest = np.min(w2v_feature)
      value_range = np.max(w2v_feature) - lowest
      if value_range == 0:
        w2v_feature = np.zeros_like(w2v_feature)
      else:
        w2v_feature = (w2v_feature - lowest) / value_range
      features = np.append(features, w2v_feature ,axis = 1)
      print('End of w2v feature')
    # -------------- bow ----------
    if bow:
      print('Start to generate bow feature')
      bow_feature = self.get_bow()
      features = np.append(features, bow_feature ,axis = 1)
      print('End of bow feature')

    if save_feature:
      joblib.dump(features, (save_path + '/' + file_name + '.pkl'))
      print('Features saved successfully.')
    return features, file_name