# retrieve_docs_simple_Export.py


from    elasticsearch import Elasticsearch
from    sklearn.preprocessing import StandardScaler
import  os, json, random, re, nltk
import  torch
import  torch.nn.functional         as F
import  torch.nn                    as nn
import  numpy                       as np
import  pickle
import  torch.autograd              as autograd
from    tqdm                        import tqdm
from    gensim.models.keyedvectors  import KeyedVectors
from    nltk.tokenize               import sent_tokenize
from    difflib                     import SequenceMatcher

bioclean    = lambda t: re.sub('[.,?;*!%^&_+():-\[\]{}]', '', t.replace('"', '').replace('/', '').replace('\\', '').replace("'", '').strip().lower()).split()
softmax     = lambda z: np.exp(z) / np.sum(np.exp(z))

similar_sets = [
set(['assessed', 'evaluated', 'analyzed', 'analysed', 'measured', 'examined']),
set(['found', 'observed']),
set(['database', 'databank']),
set(['genome', 'genome assembly']),
set(['xbp', 'xbps', 'atf']),
set(['coacervation', 'phase separation']),
set(['mitochondrion', 'mitochondria']),
set(['asians', 'caucasians', 'europeans']),
set(['breastfeeding', 'breast feeding', 'exclusive breastfeeding', 'childbirth']),
set(['activates', 'antagonizes']),
set(['mutation', 'missense mutation', 'pathogenic variant', 'nonsense mutation', 'gene mutation', 'point mutation', 'missense variant', 'deletion', 'frameshift mutation', 'novel mutation']),
set(['genes', 'gene families']),
set(['vaccine', 'influenza vaccine', 'dryvax', 'influenza vaccines', 'vaccines', 'recombinant vaccine']),
set(['monoclonal', 'polyclonal']),
set(['dickkopf', 'dkk']),
set(['expression', 'protein expression', 'expression level', 'mrna expression']),
set(['fibroids', 'myomas']),
set(['microduplication', 'microdeletion', 'q deletion', 'pathogenic variant']),
set(['alzheimers', 'alzheimer', 'huntingtons', 'alzheimer disease', 'alzheimers disease']),
set(['oncomine', 'gepia']),
set(['antibody', 'polyclonal', 'mab']),
set(['glioblastoma', 'gbm', 'glioma', 'glioblastoma multiforme', 'non small cell lung carcinoma', 'neuroblastoma', 'malignant gliomas', 'non small cell lung cancer nsclc', 'npc', 'triple negative breast cancer', 'sclc']),
set(['nanog', 'sox']),
set(['effect', 'effects']),
set(['abacavir', 'atazanavir', 'nevirapine']),
set(['endoderm', 'mesoderm', 'ectoderm', 'ureteric bud']),
set(['cyp', 'cyps', 'ugt', 'cytochrome p', 'cypa']),
set(['fingolimod', 'natalizumab', 'nintedanib']),
set(['pain', 'back pain']),
set(['promotes', 'accelerates', 'suppresses', 'restrains', 'augments']),
set(['located', 'situated']),
set(['rbioconductor', 'web tool', 'r package']),
set(['antagonists', 'agonists']),
set(['polymorphism', 'rs', 'genetic variant', 'rsag', 'genetic polymorphism']),
set(['pharmacogenetic', 'pharmacogenomic']),
set(['able', 'unable']),
set(['yeast', 'saccharomyces cerevisiae', 's cerevisiae']),
set(['used', 'utilized', 'employed', 'utilised', 'applied']),
set(['describe', 'discuss']),
set(['prevalence', 'prevalence rate']),
set(['efavirenz', 'stavudine', 'nevirapine', 'ritonavir', 'darunavir', 'rilpivirine', 'raltegravir', 'dolutegravir', 'tenofovir', 'zidovudine']),
set(['two', 'three', 'four']),
set(['trastuzumab', 'cetuximab', 'pertuzumab', 'erlotinib', 'panitumumab', 'afatinib', 'pembrolizumab', 'docetaxel', 'olaparib', 'lapatinib', 'atezolizumab']),
set(['autosomal', 'autosomal dominant', 'autosomal recessive']),
set(['azd', 'abt', 'panobinostat', 'chidamide', 'lonafarnib', 'icotinib', 'selinexor', 'gefitinib']),
set(['ofatumumab', 'alemtuzumab', 'ibrutinib', 'rituximab', 'denileukin diftitox', 'avelumab']),
set(['treatment', 'therapy']),
set(['ewings', 'ewing']),
set(['cypc', 'cypd', 'ugta', 'ugtb', 'vkorc']),
set(['positive', 'negative']),
set(['quantification', 'quantitation']),
set(['cgrp', 'galanin']),
set(['antibodies', 'igg antibodies', 'antisera']),
set(['effective', 'efficacious']),
set(['fda', 'us food and drug administration', 'us fda', 'european medicines agency', 'food and drug administration']),
set(['towards', 'toward']),
set(['propofol', 'remifentanil', 'dexmedetomidine', 'sevoflurane', 'desflurane', 'nitroglycerin', 'nicardipine', 'isoflurane', 'esmolol']),
set(['mellifera', 'apis mellifera']),
set(['recommendations', 'guidelines', 'clinical guidelines']),
set(['acupotomy', 'sgb']),
set(['molecules', 'ligands']),
set(['p', 'p≤']),
set(['clcn', 'kcnh', 'scna', 'atpa', 'kcnj']),
set(['proteins', 'polypeptides']),
set(['explain', 'reflect', 'accentuate', 'signify', 'downplay']),
set(['southern', 'northern', 'northeastern', 'southwestern', 'southeastern', 'northwestern', 'northeast', 'southwest', 'slovakia', 'southeast', 'north']),
set(['critical', 'crucial', 'important', 'essential', 'vital']),
set(['bacteria', 'microorganisms', 'microbes', 'fungi']),
set(['methodologies', 'approaches']),
set(['infection', 'salmonella infection']),
set(['zolmitriptan', 'solifenacin succinate']),
set(['hypofractionated', 'intensity modulated radiation therapy', 'whole breast', 'intensity modulated radiotherapy', 'normofractionated', 'hdr brachytherapy']),
set(['adalimumab', 'infliximab', 'etanercept', 'ustekinumab', 'golimumab', 'tcz', 'tocilizumab', 'vedolizumab', 'omalizumab', 'secukinumab', 'tofacitinib']),
set(['vesicles', 'membranes']),
set(['dexamethasone', 'dxm', 'dex']),
set(['cytokine', 'inflammatory cytokine']),
set(['domestic', 'feral']),
set(['package', 'spreadsheet']),
set(['aim', 'purpose', 'objective', 'objectives']),
set(['lncrnas', 'circrnas', 'mirnas', 'ncrnas', 'lincrnas', 'trna derived fragments', 'micrornas', 'mirs']),
set(['radiotherapy', 'radiation therapy', 'radiation treatment', 'external beam radiotherapy', 'external beam radiation therapy', 'palliative radiotherapy', 'concurrent chemoradiation', 'radiosurgery', 'chemoradiation', 'sbrt', 'whole brain radiotherapy']),
set(['september', 'july', 'august', 'october', 'march', 'november', 'february', 'june', 'april', 'december', 'january']),
set(['pseudotumor', 'pseudotumour']),
set(['pose', 'impose']),
set(['codeine', 'pseudoephedrine']),
set(['extracellular', 'intracellular']),
set(['children', 'adults', 'persons']),
set(['rhabdomyolysis', 'hyperammonemia', 'hyponatremia', 'kernicterus', 'hepatic encephalopathy']),
set(['hypercapnia', 'hypocapnia', 'hypercapnic']),
set(['color', 'colour']),
set(['polymorphisms', 'genetic polymorphisms', 'gene variants', 'single nucleotide polymorphisms']),
set(['among', 'amongst']),
set(['dipg', 'diffuse intrinsic pontine glioma']),
set(['foam', 'fabric']),
set(['sarcoma', 'rhabdomyosarcoma', 'lymphoma']),
set(['offers', 'provides', 'justifies', 'holds']),
set(['migraine', 'tension type headache', 'cluster headache', 'migraine with aura', 'migraines', 'migraine without aura']),
set(['rodents', 'nonhuman primates', 'humans']),
set(['recognized', 'recognised', 'regarded']),
set(['cyld', 'wwp', 'senp', 'skp', 'trim', 'eifa', 'smurf', 'famb']),
set(['variants', 'missense variants', 'novel variants', 'mutations']),
set(['kisspeptin', 'gnrh neurons', 'prrp']),
set(['basaloid', 'pleomorphic']),
set(['mother', 'father']),
set(['ligands', 'molecules']),
set(['panitumumab', 'pertuzumab', 'trastuzumab', 'ramucirumab', 'cetuximab']),
set(['toxoplasmosis', 'lyme disease', 'brucellosis', 'leptospirosis', 'strongyloidiasis']),
set(['repressed', 'regulated']),
set(['dkk', 'dickkopf', 'sfrp', 'activin a']),
set(['tumors', 'tumours', 'carcinomas', 'sarcomas', 'neoplasms']),
set(['regarding', 'concerning'])
]

# Compute the term frequency of a word for a specific document
def tf(term, document):
    tf = 0
    for word in document:
        if word == term:
            tf += 1
    if len(document) == 0:
        return tf
    else:
        return tf/len(document)

# Use BM25 ranking function in order to cimpute the similarity score between a question anda snippet
# query: the given question
# document: the snippet
# k1, b: parameters
# idf_scores: list with the idf scores
# avddl: average document length
# nomalize: in case we want to use Z-score normalization (Boolean)
# mean, deviation: variables used for Z-score normalization
def similarity_score(query, document, k1, b, idf_scores, avgdl, normalize, mean, deviation, rare_word):
    score = 0
    for query_term in query:
        if query_term not in idf_scores:
            score += rare_word * (
                    (tf(query_term, document) * (k1 + 1)) /
                    (
                            tf(query_term, document) +
                            k1 * (1 - b + b * (len(document) / avgdl))
                    )
            )
        else:
            score += idf_scores[query_term] * ((tf(query_term, document) * (k1 + 1)) / (tf(query_term, document) + k1 * (1 - b + b * (len(document) / avgdl))))
    if normalize:
        return ((score - mean)/deviation)
    else:
        return score

# Compute the average length from a collection of documents
def compute_avgdl(documents):
    total_words = 0
    for document in documents:
        total_words += len(document)
    avgdl = total_words / len(documents)
    return avgdl

def weighted_binary_cross_entropy(output, target, weights=None):
    if weights is not None:
        assert len(weights) == 2
        loss = weights[1] * (target * torch.log(output)) + weights[0] * ((1 - target) * torch.log(1 - output))
    else:
        loss = target * torch.log(output) + (1 - target) * torch.log(1 - output)
    return torch.neg(torch.mean(loss))

def RemoveTrainLargeYears(data, doc_text):
  data['queries'] = [q for q in data['queries'] if (len(q['retrieved_documents']) > 0)]
  for i in tqdm(range(len(data['queries']))):
    hyear = 1900
    for j in range(len(data['queries'][i]['retrieved_documents'])):
      if data['queries'][i]['retrieved_documents'][j]['is_relevant']:
        doc_id = data['queries'][i]['retrieved_documents'][j]['doc_id']
        year = doc_text[doc_id]['publicationDate'].split('-')[0]
        if year[:1] == '1' or year[:1] == '2':
          if int(year) > hyear:
            hyear = int(year)
    # if(len(data['queries'][i]['retrieved_documents'])>0):
    j = 0
    while True:
      doc_id    = data['queries'][i]['retrieved_documents'][j]['doc_id']
      year      = doc_text[doc_id]['publicationDate'].split('-')[0]
      if (year[:1] == '1' or year[:1] == '2') and int(year) > hyear:
        del data['queries'][i]['retrieved_documents'][j]
      else:
        j += 1
      if j == len(data['queries'][i]['retrieved_documents']):
        break
  return data

def RemoveBadYears(data, doc_text, train):
  for i in range(len(data['queries'])):
    j = 0
    while True:
      doc_id    = data['queries'][i]['retrieved_documents'][j]['doc_id']
      year      = doc_text[doc_id]['publicationDate'].split('-')[0]
      ##########################
      # Skip 2017/2018 docs always. Skip 2016 docs for training.
      # Need to change for final model - 2017 should be a train year only.
      # Use only for testing.
      if year == '2017' or year == '2018' or (train and year == '2016'):
      #if year == '2018' or (train and year == '2017'):
        del data['queries'][i]['retrieved_documents'][j]
      else:
        j += 1
      ##########################
      if j == len(data['queries'][i]['retrieved_documents']):
        break
  return data

def print_params(model):
    '''
    It just prints the number of parameters in the model.
    :param model:   The pytorch model
    :return:        Nothing.
    '''
    print(40 * '=')
    print(model)
    print(40 * '=')
    trainable       = 0
    untrainable     = 0
    for parameter in model.parameters():
        # print(parameter.size())
        v = 1
        for s in parameter.size():
            v *= s
        if(parameter.requires_grad):
            trainable   += v
        else:
            untrainable += v
    total_params = trainable + untrainable
    print(40 * '=')
    print('trainable:{} untrainable:{} total:{}'.format(trainable, untrainable, total_params))
    print(40 * '=')

def dummy_test():
    doc1_embeds         = np.random.rand(40, 200)
    doc2_embeds         = np.random.rand(30, 200)
    question_embeds     = np.random.rand(20, 200)
    q_idfs              = np.random.rand(20, 1)
    gaf                 = np.random.rand(4)
    baf                 = np.random.rand(4)
    for epoch in range(200):
        optimizer.zero_grad()
        cost_, doc1_emit_, doc2_emit_ = model(
            doc1_embeds     = doc1_embeds,
            doc2_embeds     = doc2_embeds,
            question_embeds = question_embeds,
            q_idfs          = q_idfs,
            gaf             = gaf,
            baf             = baf
        )
        cost_.backward()
        optimizer.step()
        the_cost = cost_.cpu().item()
        print(the_cost, float(doc1_emit_), float(doc2_emit_))
    print(20 * '-')

def compute_the_cost(costs, back_prop=True):
    cost_ = torch.stack(costs)
    cost_ = cost_.sum() / (1.0 * cost_.size(0))
    if(back_prop):
        cost_.backward()
        optimizer.step()
        optimizer.zero_grad()
    the_cost = cost_.cpu().item()
    return the_cost

def back_prop(batch_costs, epoch_costs, batch_acc, epoch_acc):
    batch_cost = sum(batch_costs) / float(len(batch_costs))
    batch_cost.backward()
    optimizer.step()
    optimizer.zero_grad()
    batch_aver_cost = batch_cost.cpu().item()
    epoch_aver_cost = sum(epoch_costs) / float(len(epoch_costs))
    batch_aver_acc  = sum(batch_acc) / float(len(batch_acc))
    epoch_aver_acc  = sum(epoch_acc) / float(len(epoch_acc))
    return batch_aver_cost, epoch_aver_cost, batch_aver_acc, epoch_aver_acc

def similar(upstream_seq, downstream_seq):
    upstream_seq    = upstream_seq.encode('ascii','ignore')
    downstream_seq  = downstream_seq.encode('ascii','ignore')
    s               = SequenceMatcher(None, upstream_seq, downstream_seq)
    match           = s.find_longest_match(0, len(upstream_seq), 0, len(downstream_seq))
    upstream_start  = match[0]
    upstream_end    = match[0]+match[2]
    longest_match   = upstream_seq[upstream_start:upstream_end]
    to_match        = upstream_seq if(len(downstream_seq)>len(upstream_seq)) else downstream_seq
    r1              = SequenceMatcher(None, to_match, longest_match).ratio()
    return r1

def get_snippets_loss(good_sent_tags, gs_emits_, bs_emits_):
    wright = torch.cat([gs_emits_[i] for i in range(len(good_sent_tags)) if (good_sent_tags[i] == 1)])
    wrong  = [gs_emits_[i] for i in range(len(good_sent_tags)) if (good_sent_tags[i] == 0)]
    wrong  = torch.cat(wrong + [bs_emits_.squeeze(-1)])
    losses = [ model.my_hinge_loss(w.unsqueeze(0).expand_as(wrong), wrong) for w in wright]
    return sum(losses) / float(len(losses))

def get_two_snip_losses(good_sent_tags, gs_emits_, bs_emits_):
    bs_emits_       = bs_emits_.squeeze(-1)
    gs_emits_       = gs_emits_.squeeze(-1)
    good_sent_tags  = torch.FloatTensor(good_sent_tags)
    tags_2          = torch.zeros_like(bs_emits_)
    if(use_cuda):
        good_sent_tags  = good_sent_tags.cuda()
        tags_2          = tags_2.cuda()
    #
    sn_d1_l         = F.binary_cross_entropy(gs_emits_, good_sent_tags, size_average=False, reduce=True)
    sn_d2_l         = F.binary_cross_entropy(bs_emits_, tags_2,         size_average=False, reduce=True)
    return sn_d1_l, sn_d2_l

def get_words(s, idf, max_idf):
    sl  = tokenize(s)
    sl  = [s for s in sl]
    sl2 = [s for s in sl if idf_val(s, idf, max_idf) >= 2.0]
    return sl, sl2

def tokenize(x):
  return bioclean(x)

def idf_val(w, idf, max_idf):
    if w in idf:
        return idf[w]
    return max_idf

def get_embeds(tokens, wv):
    ret1, ret2 = [], []
    for tok in tokens:
        if(tok in wv):
            ret1.append(tok)
            ret2.append(wv[tok])
    return ret1, np.array(ret2, 'float64')

def get_embeds_use_unk(tokens, wv):
    ret1, ret2 = [], []
    for tok in tokens:
        if(tok in wv):
            ret1.append(tok)
            ret2.append(wv[tok])
        else:
            wv[tok] = np.random.randn(embedding_dim)
            ret1.append(tok)
            ret2.append(wv[tok])
    return ret1, np.array(ret2, 'float64')

def get_embeds_use_only_unk(tokens, wv):
    ret1, ret2 = [], []
    for tok in tokens:
        wv[tok] = np.random.randn(embedding_dim)
        ret1.append(tok)
        ret2.append(wv[tok])
    return ret1, np.array(ret2, 'float64')

def load_idfs(idf_path, words):
    print('Loading IDF tables')
    #
    # with open(dataloc + 'idf.pkl', 'rb') as f:
    with open(idf_path, 'rb') as f:
        idf = pickle.load(f)
    ret = {}
    for w in words:
        if w in idf:
            ret[w] = idf[w]
    max_idf = 0.0
    for w in idf:
        if idf[w] > max_idf:
            max_idf = idf[w]
    idf = None
    print('Loaded idf tables with max idf {}'.format(max_idf))
    #
    return ret, max_idf

def uwords(words):
  uw = {}
  for w in words:
    uw[w] = 1
  return [w for w in uw]

def ubigrams(words):
  uw = {}
  prevw = "<pw>"
  for w in words:
    uw[prevw + '_' + w] = 1
    prevw = w
  return [w for w in uw]

def query_doc_overlap(qwords, dwords, idf, max_idf):
    # % Query words in doc.
    qwords_in_doc = 0
    idf_qwords_in_doc = 0.0
    idf_qwords = 0.0
    for qword in uwords(qwords):
      idf_qwords += idf_val(qword, idf, max_idf)
      for dword in uwords(dwords):
        if qword == dword:
          idf_qwords_in_doc += idf_val(qword, idf, max_idf)
          qwords_in_doc     += 1
          break
    if len(qwords) <= 0:
      qwords_in_doc_val = 0.0
    else:
      qwords_in_doc_val = (float(qwords_in_doc) / float(len(uwords(qwords))))
    if idf_qwords <= 0.0:
      idf_qwords_in_doc_val = 0.0
    else:
      idf_qwords_in_doc_val = float(idf_qwords_in_doc) / float(idf_qwords)
    # % Query bigrams  in doc.
    qwords_bigrams_in_doc = 0
    idf_qwords_bigrams_in_doc = 0.0
    idf_bigrams = 0.0
    for qword in ubigrams(qwords):
      wrds = qword.split('_')
      idf_bigrams += idf_val(wrds[0], idf, max_idf) * idf_val(wrds[1], idf, max_idf)
      for dword in ubigrams(dwords):
        if qword == dword:
          qwords_bigrams_in_doc += 1
          idf_qwords_bigrams_in_doc += (idf_val(wrds[0], idf, max_idf) * idf_val(wrds[1], idf, max_idf))
          break
    if len(qwords) <= 0:
      qwords_bigrams_in_doc_val = 0.0
    else:
      qwords_bigrams_in_doc_val = (float(qwords_bigrams_in_doc) / float(len(ubigrams(qwords))))
    if idf_bigrams <= 0.0:
      idf_qwords_bigrams_in_doc_val = 0.0
    else:
      idf_qwords_bigrams_in_doc_val = (float(idf_qwords_bigrams_in_doc) / float(idf_bigrams))
    return [
        qwords_in_doc_val,
        qwords_bigrams_in_doc_val,
        idf_qwords_in_doc_val,
        idf_qwords_bigrams_in_doc_val
    ]

def GetScores(qtext, dtext, bm25, idf, max_idf):
    qwords, qw2 = get_words(qtext, idf, max_idf)
    dwords, dw2 = get_words(dtext, idf, max_idf)
    qd1         = query_doc_overlap(qwords, dwords, idf, max_idf)
    bm25        = [bm25]
    return qd1[0:3] + bm25

def GetWords(data, doc_text, words):
  for i in range(len(data['queries'])):
    qwds = tokenize(data['queries'][i]['query_text'])
    for w in qwds:
      words[w] = 1
    for j in range(len(data['queries'][i]['retrieved_documents'])):
      doc_id = data['queries'][i]['retrieved_documents'][j]['doc_id']
      dtext = (
              doc_text[doc_id]['title'] + ' <title> ' + doc_text[doc_id]['abstractText']
              # +
              # ' '.join(
              #     [
              #         ' '.join(mm) for mm in
              #         get_the_mesh(doc_text[doc_id])
              #     ]
              # )
      )
      dwds = tokenize(dtext)
      for w in dwds:
        words[w] = 1

def prep_extracted_snippets(extracted_snippets, docs, qid, top10docs, quest_body):
    ret = {
        'body'      : quest_body,
        'documents' : top10docs,
        'id'        : qid,
        'snippets'  : [],
    }
    for esnip in extracted_snippets:
        pid         = esnip[2].split('/')[-1]
        the_text    = esnip[3]
        esnip_res = {
            # 'score'     : esnip[1],
            "document"  : "http://www.ncbi.nlm.nih.gov/pubmed/{}".format(pid),
            "text"      : the_text
        }
        try:
            ind_from                            = docs[pid]['title'].index(the_text)
            ind_to                              = ind_from + len(the_text)
            esnip_res["beginSection"]           = "title"
            esnip_res["endSection"]             = "title"
            esnip_res["offsetInBeginSection"]   = ind_from
            esnip_res["offsetInEndSection"]     = ind_to
        except:
            # print(the_text)
            # pprint(docs[pid])
            ind_from                            = docs[pid]['abstractText'].index(the_text)
            ind_to                              = ind_from + len(the_text)
            esnip_res["beginSection"]           = "abstract"
            esnip_res["endSection"]             = "abstract"
            esnip_res["offsetInBeginSection"]   = ind_from
            esnip_res["offsetInEndSection"]     = ind_to
        ret['snippets'].append(esnip_res)
    return ret

def get_snips(quest_id, gid, bioasq6_data):
    good_snips = []
    if('snippets' in bioasq6_data[quest_id]):
        for sn in bioasq6_data[quest_id]['snippets']:
            if(sn['document'].endswith(gid)):
                good_snips.extend(sent_tokenize(sn['text']))
    return good_snips

def get_the_mesh(the_doc):
    good_meshes = []
    if('meshHeadingsList' in the_doc):
        for t in the_doc['meshHeadingsList']:
            t = t.split(':', 1)
            t = t[1].strip()
            t = t.lower()
            good_meshes.append(t)
    elif('MeshHeadings' in the_doc):
        for mesh_head_set in the_doc['MeshHeadings']:
            for item in mesh_head_set:
                good_meshes.append(item['text'].strip().lower())
    if('Chemicals' in the_doc):
        for t in the_doc['Chemicals']:
            t = t['NameOfSubstance'].strip().lower()
            good_meshes.append(t)
    good_mesh = sorted(good_meshes)
    good_mesh = ['mesh'] + good_mesh
    # good_mesh = ' # '.join(good_mesh)
    # good_mesh = good_mesh.split()
    # good_mesh = [gm.split() for gm in good_mesh]
    good_mesh = [gm for gm in good_mesh]
    return good_mesh

def snip_is_relevant(one_sent, gold_snips):
    # print one_sent
    # pprint(gold_snips)
    return int(
        any(
            [
                (one_sent.encode('ascii', 'ignore')  in gold_snip.encode('ascii','ignore'))
                or
                (gold_snip.encode('ascii', 'ignore') in one_sent.encode('ascii','ignore'))
                for gold_snip in gold_snips
            ]
        )
    )

def prep_data(quest, the_doc, the_bm25, wv, good_snips, idf, max_idf, use_sent_tokenizer):
    if(emit_only_abstract_sents):
        good_sents = sent_tokenize(the_doc['abstractText'])
    else:
        good_sents      = sent_tokenize(the_doc['title']) + sent_tokenize(the_doc['abstractText'])
    ####
    quest_toks      = tokenize(quest)
    good_doc_af     = GetScores(quest, the_doc['title'] + the_doc['abstractText'], the_bm25, idf, max_idf)
    good_doc_af.append(len(good_sents) / 60.)
    #
    doc_toks            = tokenize(the_doc['title'] + the_doc['abstractText'])
    tomi                = (set(doc_toks) & set(quest_toks))
    tomi_no_stop        = tomi - set(stopwords)
    BM25score           = similarity_score(quest_toks, doc_toks, 1.2, 0.75, idf, avgdl, True, mean, deviation, max_idf)
    tomi_no_stop_idfs   = [idf_val(w, idf, max_idf) for w in tomi_no_stop]
    tomi_idfs           = [idf_val(w, idf, max_idf) for w in tomi]
    quest_idfs          = [idf_val(w, idf, max_idf) for w in quest_toks]
    features            = [
        len(quest)                                      / 300.,
        len(the_doc['title'] + the_doc['abstractText']) / 300.,
        len(tomi_no_stop)                               / 100.,
        BM25score,
        sum(tomi_no_stop_idfs)                          / 100.,
        sum(tomi_idfs)                                  / sum(quest_idfs),
    ]
    good_doc_af.extend(features)
    ####
    good_sents_embeds, good_sents_escores, held_out_sents, good_sent_tags = [], [], [], []
    for good_text in good_sents:
        sent_toks                   = tokenize(good_text)
        good_tokens, good_embeds    = get_embeds(sent_toks, wv)
        good_escores                = GetScores(quest, good_text, the_bm25, idf, max_idf)[:-1]
        good_escores.append(len(sent_toks)/ 342.)
        if (len(good_embeds) > 0):
            #
            tomi                = (set(sent_toks) & set(quest_toks))
            tomi_no_stop        = tomi - set(stopwords)
            BM25score           = similarity_score(quest_toks, sent_toks, 1.2, 0.75, idf, avgdl, True, mean, deviation, max_idf)
            tomi_no_stop_idfs   = [idf_val(w, idf, max_idf) for w in tomi_no_stop]
            tomi_idfs           = [idf_val(w, idf, max_idf) for w in tomi]
            quest_idfs          = [idf_val(w, idf, max_idf) for w in quest_toks]
            features            = [
                len(quest)              / 300.,
                len(good_text)          / 300.,
                len(tomi_no_stop)       / 100.,
                BM25score,
                sum(tomi_no_stop_idfs)  / 100.,
                sum(tomi_idfs)          / sum(quest_idfs),
            ]
            #
            good_sents_embeds.append(good_embeds)
            good_sents_escores.append(good_escores+features)
            held_out_sents.append(good_text)
            good_sent_tags.append(snip_is_relevant(' '.join(bioclean(good_text)), good_snips))
    ####
    return {
        'sents_embeds'     : good_sents_embeds,
        'sents_escores'    : good_sents_escores,
        'doc_af'           : good_doc_af,
        'sent_tags'        : good_sent_tags,
        'held_out_sents'   : held_out_sents,
    }

def do_for_one_retrieved(doc_emit_, gs_emits_, held_out_sents, retr, doc_res, gold_snips):
    emition                 = doc_emit_.cpu().item()
    emitss                  = gs_emits_.tolist()
    mmax                    = max(emitss)
    all_emits, extracted_from_one = [], []
    for ind in range(len(emitss)):
        t = (
            snip_is_relevant(held_out_sents[ind], gold_snips),
            emitss[ind],
            "http://www.ncbi.nlm.nih.gov/pubmed/{}".format(retr['doc_id']),
            held_out_sents[ind]
        )
        all_emits.append(t)
        # extracted_from_one.append(t)
        # if(emitss[ind] == mmax):
        #     extracted_from_one.append(t)
        if(emitss[ind]> min_sent_score):
            extracted_from_one.append(t)
    doc_res[retr['doc_id']] = float(emition)
    all_emits               = sorted(all_emits, key=lambda x: x[1], reverse=True)
    # return only top K
    extracted_from_one      = sorted([t for t in extracted_from_one if len(t[3])>30], key=lambda x:x[1], reverse=True)[:2]
    return doc_res, extracted_from_one, all_emits

def get_norm_doc_scores(the_doc_scores):
    ks = list(the_doc_scores.keys())
    vs = [the_doc_scores[k] for k in ks]
    vs = softmax(vs)
    norm_doc_scores = {}
    for i in range(len(ks)):
        norm_doc_scores[ks[i]] = vs[i]
    return norm_doc_scores

def select_snippets_v1(extracted_snippets):
    '''
    :param extracted_snippets:
    :param doc_res:
    :return: returns the best 10 snippets of all docs (0..n from each doc)
    '''
    sorted_snips = sorted(extracted_snippets, key=lambda x: x[1], reverse=True)
    return sorted_snips[:10]

def select_snippets_v2(extracted_snippets):
    '''
    :param extracted_snippets:
    :param doc_res:
    :return: returns the best snippet of each doc  (1 from each doc)
    '''
    # is_relevant, the_sent_score, ncbi_pmid_link, the_actual_sent_text
    ret                 = {}
    for es in extracted_snippets:
        if(es[2] in ret):
            if(es[1] > ret[es[2]][1]):
                ret[es[2]] = es
        else:
            ret[es[2]] = es
    sorted_snips =  sorted(ret.values(), key=lambda x: x[1], reverse=True)
    return sorted_snips[:10]

def select_snippets_v3(extracted_snippets, the_doc_scores):
    '''
    :param      extracted_snippets:
    :param      doc_res:
    :return:    returns the top 10 snippets across all documents (0..n from each doc)
    '''
    norm_doc_scores     = get_norm_doc_scores(the_doc_scores)
    # is_relevant, the_sent_score, ncbi_pmid_link, the_actual_sent_text
    extracted_snippets  = [tt for tt in extracted_snippets if (tt[2] in norm_doc_scores)]
    sorted_snips        = sorted(extracted_snippets, key=lambda x: x[1] * norm_doc_scores[x[2]], reverse=True)
    return sorted_snips[:10]

def do_for_some_retrieved(docs, dato, retr_docs, data_for_revision, ret_data, use_sent_tokenizer):
    emitions                    = {
        'body': dato['query_text'],
        'id': dato['query_id'],
        'documents': []
    }
    #
    quest_text                  = dato['query_text']
    quest_tokens, quest_embeds  = get_embeds(tokenize(quest_text), wv)
    q_idfs                      = np.array([[idf_val(qw, idf, max_idf)] for qw in quest_tokens], 'float')
    #
    doc_res, extracted_snippets         = {}, []
    for retr in retr_docs:
        datum                   = prep_data(quest_text, docs[retr['doc_id']], retr['norm_bm25_score'], wv, [], idf, max_idf, use_sent_tokenizer=use_sent_tokenizer)
        doc_emit_, gs_emits_    = model.emit_one(
            doc1_sents_embeds   = datum['sents_embeds'],
            question_embeds     = quest_embeds,
            q_idfs              = q_idfs,
            sents_gaf           = datum['sents_escores'],
            doc_gaf             = datum['doc_af']
        )
        doc_res, extracted_from_one, all_emits = do_for_one_retrieved(doc_emit_, gs_emits_, datum['held_out_sents'], retr, doc_res, [])
        # is_relevant, the_sent_score, ncbi_pmid_link, the_actual_sent_text
        extracted_snippets.extend(extracted_from_one)
        #
        if (dato['query_id'] not in data_for_revision):
            data_for_revision[dato['query_id']] = {
                'query_text': dato['query_text'],
                'snippets'  : {
                    retr['doc_id']: all_emits
                }
            }
        else:
            data_for_revision[dato['query_id']]['snippets'][retr['doc_id']] = all_emits
    #
    data_for_revision[dato['query_id']]['doc_scores'] = doc_res
    doc_res                                 = sorted([item for item in doc_res.items() if(item[1]>min_doc_score)], key = lambda x: x[1], reverse = True)
    the_doc_scores                          = dict([("http://www.ncbi.nlm.nih.gov/pubmed/{}".format(pm[0]), pm[1]) for pm in doc_res[:10]])
    doc_res                                 = ["http://www.ncbi.nlm.nih.gov/pubmed/{}".format(pm[0]) for pm in doc_res]
    emitions['documents']                   = doc_res[:100]
    ret_data['questions'].append(emitions)
    #
    extracted_snippets                      = [tt for tt in extracted_snippets if (tt[2] in doc_res[:10])]
    #
    if(use_sent_tokenizer):
        extracted_snippets_v1               = select_snippets_v1(extracted_snippets)
        extracted_snippets_v2               = select_snippets_v2(extracted_snippets)
        extracted_snippets_v3               = select_snippets_v3(extracted_snippets, the_doc_scores)
    else:
        extracted_snippets_v1, extracted_snippets_v2, extracted_snippets_v3 = [], [], []
    #
    snips_res_v1                = prep_extracted_snippets(extracted_snippets_v1, docs, dato['query_id'], doc_res[:10], dato['query_text'])
    snips_res_v2                = prep_extracted_snippets(extracted_snippets_v2, docs, dato['query_id'], doc_res[:10], dato['query_text'])
    snips_res_v3                = prep_extracted_snippets(extracted_snippets_v3, docs, dato['query_id'], doc_res[:10], dato['query_text'])
    #
    snips_res = {
        'v1' : snips_res_v1,
        'v2' : snips_res_v2,
        'v3' : snips_res_v3,
    }
    return data_for_revision, ret_data, snips_res

def get_bioasq_res(prefix, data_emitted, data_for_revision):
    """Write the revision data and the emitted submission as JSON files.

    Two files are created inside the module-level ``odir`` directory:
    ``<prefix>_data_for_revision.json`` and ``<prefix>_emit_bioasq.json``.
    Both are pretty-printed (indent of 4) with sorted keys.

    Despite its name the function returns nothing.

    :param prefix: string placed in front of both file names.
    :param data_emitted: JSON-serialisable object written to the emit file.
    :param data_for_revision: JSON-serialisable object written to the revision file.
    """
    # Same order as before: the revision file first, then the emitted file.
    outputs = (
        ('{}_data_for_revision.json', data_for_revision),
        ('{}_emit_bioasq.json', data_emitted),
    )
    for name_template, payload in outputs:
        target = os.path.abspath(os.path.join(odir, name_template.format(prefix)))
        with open(target, 'w') as out_file:
            out_file.write(json.dumps(payload, indent=4, sort_keys=True))

def get_one_map(prefix, data, docs, use_sent_tokenizer):
    """Score every query in ``data`` with the global ``model`` and dump the results.

    For each entry of ``data['queries']`` the retrieved documents are passed to
    ``do_for_some_retrieved``; its three snippet selections (``v1``/``v2``/``v3``)
    are accumulated and written with ``get_bioasq_res`` under the prefixes
    ``'v1 <prefix>'``, ``'v2 <prefix>'`` and ``'v3 <prefix>'``.  The accumulated
    document rankings are written to
    ``elk_relevant_abs_posit_drmm_lists_dev.json`` when ``prefix == 'dev'`` and to
    ``elk_relevant_abs_posit_drmm_lists_test.json`` otherwise.

    :param prefix: run name used in the output file names.
    :param data: dict with a ``'queries'`` list; each query carries
        ``'retrieved_documents'``.
    :param docs: document store handed through to ``do_for_some_retrieved``.
    :param use_sent_tokenizer: flag handed through to ``do_for_some_retrieved``.
    :return: always ``None``.
    """
    model.eval()
    #
    versions            = ('v1', 'v2', 'v3')
    submissions         = {version: {"questions": []} for version in versions}
    ret_data            = {'questions': []}
    data_for_revision   = {}
    #
    for dato in tqdm(data['queries']):
        data_for_revision, ret_data, snips_res = do_for_some_retrieved(
            docs, dato, dato['retrieved_documents'], data_for_revision, ret_data, use_sent_tokenizer
        )
        for version in versions:
            submissions[version]['questions'].append(snips_res[version])
    #
    for version in versions:
        get_bioasq_res(version + ' ' + prefix, submissions[version], data_for_revision)
    #
    if prefix == 'dev':
        ranking_file = 'elk_relevant_abs_posit_drmm_lists_dev.json'
    else:
        ranking_file = 'elk_relevant_abs_posit_drmm_lists_test.json'
    with open(os.path.join(odir, ranking_file), 'w') as f:
        f.write(json.dumps(ret_data, indent=4, sort_keys=True))
    return None

def fix2(qtext):
    """Turn a natural-language question into a keyword query string.

    The text is lower-cased, a few leading command words and filler phrases are
    removed, stopwords are dropped and the remainder is tokenised with
    ``bioclean_mod``.

    :param qtext: the question text.
    :return: the cleaned tokens joined by single spaces.
    """
    qtext = qtext.lower()
    # Leading phrases are tested one after another, in this order, so more
    # than one can be stripped from the same question.
    for lead in ('can ', 'list the ', 'list ', 'describe the ', 'describe '):
        if qtext.startswith(lead):
            qtext = qtext[len(lead):]
    # "list as many ... as possible" is only removed when both halves occur.
    if 'list as many ' in qtext and 'as possible' in qtext:
        qtext = qtext.replace('list as many ', '').replace('as possible', '')
    for filler in ('yes or no', 'also known as', 'is used to ', 'are used to '):
        qtext = qtext.replace(filler, '')
    # Stopwords are filtered on whitespace tokens before punctuation cleaning.
    without_stopwords = ' '.join(tok for tok in qtext.split() if tok not in stopwords)
    return ' '.join(bioclean_mod(without_stopwords))

def fix1(qtext):
    """Tokenise ``qtext`` with ``bioclean_mod`` and drop stopwords.

    :param qtext: the question text.
    :return: the remaining tokens joined by single spaces.
    """
    return ' '.join(tok for tok in bioclean_mod(qtext) if tok not in stopwords)

def get_first_n_1(qtext, n, max_year=2022, expand=False):
    """Retrieve up to ``n`` documents for a question from Elasticsearch.

    The question is reduced to keywords with ``fix2`` (and printed), then
    searched in the ``joint_text`` field of the ``doc_index`` index through the
    global ``es`` client.  A hit must have ``DateCompleted`` between 1900 and
    ``max_year`` and satisfy at least one of the "should" clauses: three
    ``match`` clauses with minimum_should_match of 30%, 50% and 70%, and one
    ``match_phrase`` clause.

    :param qtext: the question text.
    :param n: maximum number of hits to request.
    :param max_year: upper bound (inclusive) for ``DateCompleted``.
    :param expand: when true, every query token found in one of the
        ``similar_sets`` adds a ``match`` clause over all members of that set.
    :return: the list found under ``['hits']['hits']`` of the search response.
    """
    question = fix2(qtext)
    print(question)

    def match_clause(query_text, min_match):
        # One "match" clause on joint_text with the given minimum_should_match.
        return {
            "match": {
                "joint_text": {
                    "query": query_text,
                    "boost": 1,
                    'minimum_should_match': min_match
                }
            }
        }

    should_clauses = [match_clause(question, percent) for percent in ("30%", "50%", "70%")]
    should_clauses.append({"match_phrase": {"joint_text": {"query": question, "boost": 1}}})
    ################################################
    if expand:
        # One extra clause per (query token, synonym set) pair that matches.
        should_clauses.extend(
            match_clause(
                ' '.join(list(similar_set)),
                "{}%".format(int(100/len(similar_set)))
            )
            for token in set(question.split())
            for similar_set in similar_sets
            if token in similar_set
        )
    ################################################
    date_filter = {
        "range": {
            "DateCompleted": {"gte": "1900", "lte": str(max_year), "format": "dd/MM/yyyy||yyyy"}
        }
    }
    bod = {
        "size": n,
        "query": {
            "bool": {
                "must": [date_filter],
                "should": should_clauses,
                "minimum_should_match": 1,
            }
        }
    }
    ################################################
    res = es.search(index=doc_index, body=bod, request_timeout=120)
    return res['hits']['hits']

class Sent_Posit_Drmm_Modeler(nn.Module):
    """Question/document relevance scorer built from per-sentence similarity features.

    For every sentence of a document the model pools cosine similarities
    between question tokens and sentence tokens (raw embeddings, an exact-match
    indicator, and convolution-contextualised embeddings), turns them into one
    sentence score, keeps the ``k_sent_maxpool`` highest sentence scores and
    feeds them (optionally together with extra document features) to a small
    MLP that produces the document score.

    Behaviour depends on module-level flags that are read when the model is
    built or called: ``use_cuda``, ``use_sent_extra``, ``use_doc_extra``,
    ``use_W2V_sim``, ``use_OH_sim``, ``use_context_sim`` and ``use_last_layer``.

    NOTE: the sub-module attribute names below are the keys of the model's
    ``state_dict``; renaming them breaks loading of existing checkpoints.
    """
    def __init__(self,
             embedding_dim          = 30,
             k_for_maxpool          = 5,
             sentence_out_method    = 'MLP',
             k_sent_maxpool         = 1
         ):
        """Build all sub-modules.

        :param embedding_dim: size of the word embeddings (conv channels).
        :param k_for_maxpool: ``k`` used by ``pooling_method`` (top-k similarities).
        :param sentence_out_method: ``'MLP'`` for a feed-forward sentence scorer,
            anything else for the bidirectional GRU scorer.
        :param k_sent_maxpool: number of best sentence scores kept per document.
        """
        super(Sent_Posit_Drmm_Modeler, self).__init__()
        self.k                                      = k_for_maxpool
        self.k_sent_maxpool                         = k_sent_maxpool
        # Number of extra hand-crafted features per sentence / per document.
        if(use_sent_extra):
            self.sent_add_feats = 10
        else:
            self.sent_add_feats = 0
        if(use_doc_extra):
            self.doc_add_feats  = 11
        else:
            self.doc_add_feats  = 0
        #
        self.embedding_dim                          = embedding_dim
        self.sentence_out_method                    = sentence_out_method
        # to create q weights
        self.init_context_module()
        self.init_question_weight_module()
        self.init_mlps_for_pooled_attention()
        self.init_sent_output_layer()
        self.init_doc_out_layer()
        # doc loss func
        self.margin_loss        = nn.MarginRankingLoss(margin=1.0)
        if(use_cuda):
            self.margin_loss    = self.margin_loss.cuda()
    def init_mesh_module(self):
        """Create the GRU used by ``apply_mesh_gru``.

        Not called from ``__init__``; it must be invoked explicitly before the
        mesh helpers can be used.
        """
        self.mesh_h0    = autograd.Variable(torch.randn(1, 1, self.embedding_dim))
        self.mesh_gru   = nn.GRU(self.embedding_dim, self.embedding_dim)
        if(use_cuda):
            self.mesh_h0    = self.mesh_h0.cuda()
            self.mesh_gru   = self.mesh_gru.cuda()
    def init_context_module(self):
        """Create the two stacked width-3 convolutions that contextualise token embeddings."""
        self.trigram_conv_1             = nn.Conv1d(self.embedding_dim, self.embedding_dim, 3, padding=2, bias=True)
        # Sigmoid activations are used; an earlier variant (previously left here
        # as commented-out code) used LeakyReLU(negative_slope=0.1) instead.
        self.trigram_conv_activation_1 = torch.nn.Sigmoid()
        self.trigram_conv_2             = nn.Conv1d(self.embedding_dim, self.embedding_dim, 3, padding=2, bias=True)
        self.trigram_conv_activation_2 = torch.nn.Sigmoid()
        if(use_cuda):
            self.trigram_conv_1             = self.trigram_conv_1.cuda()
            self.trigram_conv_2             = self.trigram_conv_2.cuda()
            self.trigram_conv_activation_1  = self.trigram_conv_activation_1.cuda()
            self.trigram_conv_activation_2  = self.trigram_conv_activation_2.cuda()
    def init_question_weight_module(self):
        """Create the linear layer that scores each question token (context vector + 1 idf value)."""
        self.q_weights_mlp      = nn.Linear(self.embedding_dim+1, 1, bias=True)
        if(use_cuda):
            self.q_weights_mlp  = self.q_weights_mlp.cuda()
    def init_mlps_for_pooled_attention(self):
        """Create the MLP applied to the pooled similarity features of each question token.

        Each enabled similarity type contributes the 3 values produced by
        ``pooling_method``, hence an input size of ``how_many * 3``.
        """
        how_many = 0
        if(use_W2V_sim):
            how_many += 1
        if(use_OH_sim):
            how_many += 1
        if(use_context_sim):
            how_many += 1
        self.linear_per_q1      = nn.Linear(how_many * 3, 8, bias=True)
        self.my_relu1           = torch.nn.LeakyReLU(negative_slope=0.1)
        self.linear_per_q2      = nn.Linear(8, 1, bias=True)
        if(use_cuda):
            self.linear_per_q1  = self.linear_per_q1.cuda()
            self.linear_per_q2  = self.linear_per_q2.cuda()
            self.my_relu1       = self.my_relu1.cuda()
    def init_sent_output_layer(self):
        """Create the sentence scorer: a 2-layer MLP for ``'MLP'``, otherwise a BiGRU + linear layer."""
        if(self.sentence_out_method == 'MLP'):
            self.sent_out_layer_1       = nn.Linear(self.sent_add_feats+1, 8, bias=False)
            self.sent_out_activ_1       = torch.nn.LeakyReLU(negative_slope=0.1)
            self.sent_out_layer_2       = nn.Linear(8, 1, bias=False)
            if(use_cuda):
                self.sent_out_layer_1   = self.sent_out_layer_1.cuda()
                self.sent_out_activ_1   = self.sent_out_activ_1.cuda()
                self.sent_out_layer_2   = self.sent_out_layer_2.cuda()
        else:
            self.sent_res_h0    = autograd.Variable(torch.randn(2, 1, 5))
            self.sent_res_bigru = nn.GRU(input_size=self.sent_add_feats+1, hidden_size=5, bidirectional=True, batch_first=False)
            self.sent_res_mlp   = nn.Linear(10, 1, bias=False)
            if(use_cuda):
                self.sent_res_h0    = self.sent_res_h0.cuda()
                self.sent_res_bigru = self.sent_res_bigru.cuda()
                self.sent_res_mlp   = self.sent_res_mlp.cuda()
    def init_doc_out_layer(self):
        """Create the document scorer MLP and ``oo_layer``, which mixes sentence and document scores."""
        self.final_layer_1 = nn.Linear(self.doc_add_feats+self.k_sent_maxpool, 8, bias=True)
        self.final_activ_1  = torch.nn.LeakyReLU(negative_slope=0.1)
        self.final_layer_2  = nn.Linear(8, 1, bias=True)
        self.oo_layer       = nn.Linear(2, 1, bias=True)
        if(use_cuda):
            self.final_layer_1  = self.final_layer_1.cuda()
            self.final_activ_1  = self.final_activ_1.cuda()
            self.final_layer_2  = self.final_layer_2.cuda()
            self.oo_layer       = self.oo_layer.cuda()
    def my_hinge_loss(self, positives, negatives, margin=1.0):
        """Return ``sum(relu(margin + negatives - positives))`` over the last dimension."""
        delta      = negatives - positives
        loss_q_pos = torch.sum(F.relu(margin + delta), dim=-1)
        return loss_q_pos
    def apply_context_gru(self, the_input, h0):
        """Contextualise ``the_input`` with a bidirectional GRU plus a residual connection.

        NOTE: ``self.context_gru`` and ``self.context_gru_activation`` are not
        created by any ``init_*`` method of this class, so this method fails
        unless they are attached from outside.

        :return: ``(contextualised input, final hidden state)``.
        """
        output, hn      = self.context_gru(the_input.unsqueeze(1), h0)
        output          = self.context_gru_activation(output)
        out_forward     = output[:, 0, :self.embedding_dim]
        out_backward    = output[:, 0, self.embedding_dim:]
        output          = out_forward + out_backward
        res             = output + the_input
        return res, hn
    def apply_context_convolution(self, the_input, the_filters, activation):
        """Apply a 1-D convolution over the token axis and add the input back (residual).

        :param the_input: token embeddings; the code assumes a 2-D tensor of
            shape (tokens, embedding_dim).
        :param the_filters: an ``nn.Conv1d``; its padding decides how the
            output is cropped back to the input length.
        :param activation: callable applied to the convolution output, or ``None``.
        :return: tensor with the same shape as ``the_input``.
        """
        conv_res        = the_filters(the_input.transpose(0,1).unsqueeze(0))
        if(activation is not None):
            conv_res    = activation(conv_res)
        # The padded convolution yields more positions than tokens; keep the
        # window of len(tokens) positions starting at floor(padding / 2).
        pad             = the_filters.padding[0]
        ind_from        = int(np.floor(pad/2.0))
        ind_to          = ind_from + the_input.size(0)
        conv_res        = conv_res[:, :, ind_from:ind_to]
        conv_res        = conv_res.transpose(1, 2)
        # residual
        conv_res = conv_res + the_input
        return conv_res.squeeze(0)
    def my_cosine_sim(self, A, B):
        """Return the pairwise cosine similarities between the rows of ``A`` and ``B``.

        The result keeps a leading batch dimension of size 1, i.e. it has shape
        (1, rows of A, rows of B).  No epsilon is added to the denominator, so
        an all-zero row produces non-finite values.
        """
        A           = A.unsqueeze(0)
        B           = B.unsqueeze(0)
        A_mag       = torch.norm(A, 2, dim=2)
        B_mag       = torch.norm(B, 2, dim=2)
        num         = torch.bmm(A, B.transpose(-1,-2))
        den         = torch.bmm(A_mag.unsqueeze(-1), B_mag.unsqueeze(-1).transpose(-1,-2))
        dist_mat    = num / den
        return dist_mat
    def pooling_method(self, sim_matrix):
        """Summarise each row of ``sim_matrix`` by three numbers.

        :return: for every row ``[maximum, average of the k largest values,
            average of all values]`` stacked along the last dimension.  The
            k-largest average always divides by ``self.k``, even when a row
            has fewer than ``k`` entries.
        """
        sorted_res              = torch.sort(sim_matrix, -1)[0]                             # sort the input minimum to maximum
        k_max_pooled            = sorted_res[:,-self.k:]                                    # select the last k of each instance in our data
        average_k_max_pooled    = k_max_pooled.sum(-1)/float(self.k)                        # average these k values
        the_maximum             = k_max_pooled[:, -1]                                       # select the maximum value of each instance
        the_average_over_all    = sorted_res.sum(-1)/float(sim_matrix.size(1))              # add average of all elements as long sentences might have more matches
        the_concatenation       = torch.stack([the_maximum, average_k_max_pooled, the_average_over_all], dim=-1)  # concatenate maximum value and average of k-max values
        return the_concatenation     # return the concatenation
    def get_output(self, input_list, weights):
        """Map pooled features to one score: per-token MLP, weighting, then mean over tokens.

        :param input_list: tensors concatenated along the last dimension.
        :param weights: per-question-token weights multiplied into the MLP output.
        """
        temp    = torch.cat(input_list, -1)
        lo      = self.linear_per_q1(temp)
        lo      = self.my_relu1(lo)
        lo      = self.linear_per_q2(lo)
        lo      = lo.squeeze(-1)
        lo      = lo * weights
        sr      = lo.sum(-1) / lo.size(-1)
        return sr
    def apply_sent_res_bigru(self, the_input):
        """Score the sentence feature sequence with the BiGRU followed by a linear layer."""
        output, hn      = self.sent_res_bigru(the_input.unsqueeze(1), self.sent_res_h0)
        output          = self.sent_res_mlp(output)
        return output.squeeze(-1).squeeze(-1)
    def do_for_one_doc_cnn(self, doc_sents_embeds, sents_af, question_embeds, q_conv_res_trigram, q_weights, k2):
        """Score every sentence of one document (convolutional context encoder).

        :param doc_sents_embeds: one embedding matrix per sentence.
        :param sents_af: extra features per sentence (used when ``use_sent_extra``).
        :param question_embeds: question token embeddings.
        :param q_conv_res_trigram: contextualised question token embeddings.
        :param q_weights: per-question-token weights.
        :param k2: number of best sentence scores to keep.
        :return: ``(k2 best sentence scores, all raw sentence scores)``; the
            sentence scores are returned without a sigmoid.
        """
        res = []
        for i in range(len(doc_sents_embeds)):
            sent_embeds         = autograd.Variable(torch.FloatTensor(doc_sents_embeds[i]), requires_grad=False)
            gaf                 = autograd.Variable(torch.FloatTensor(sents_af[i]), requires_grad=False)
            if(use_cuda):
                sent_embeds     = sent_embeds.cuda()
                gaf             = gaf.cuda()
            conv_res            = self.apply_context_convolution(sent_embeds,   self.trigram_conv_1, self.trigram_conv_activation_1)
            conv_res            = self.apply_context_convolution(conv_res,      self.trigram_conv_2, self.trigram_conv_activation_2)
            #
            sim_insens          = self.my_cosine_sim(question_embeds, sent_embeds).squeeze(0)
            # "one-hot" similarity: 1 where the cosine is (almost) exactly 1.
            sim_oh              = (sim_insens > (1 - (1e-3))).float()
            sim_sens            = self.my_cosine_sim(q_conv_res_trigram, conv_res).squeeze(0)
            #
            insensitive_pooled  = self.pooling_method(sim_insens)
            sensitive_pooled    = self.pooling_method(sim_sens)
            oh_pooled           = self.pooling_method(sim_oh)
            #
            the_inputs          = []
            if(use_OH_sim):
                the_inputs.append(oh_pooled)
            if(use_W2V_sim):
                the_inputs.append(insensitive_pooled)
            if(use_context_sim):
                the_inputs.append(sensitive_pooled)
            sent_emit           = self.get_output(the_inputs, q_weights)
            if(use_sent_extra):
                sent_add_feats = torch.cat([gaf, sent_emit.unsqueeze(-1)])
            else:
                sent_add_feats = torch.cat([sent_emit.unsqueeze(-1)])
            res.append(sent_add_feats)
        res = torch.stack(res)
        if(self.sentence_out_method == 'MLP'):
            res = self.sent_out_layer_1(res)
            res = self.sent_out_activ_1(res)
            res = self.sent_out_layer_2(res).squeeze(-1)
        else:
            res = self.apply_sent_res_bigru(res)
        # ret = self.get_max(res).unsqueeze(0)
        ret = self.get_kmax(res, k2)
        return ret, res
    def do_for_one_doc_bigru(self, doc_sents_embeds, sents_af, question_embeds, q_conv_res_trigram, q_weights, k2):
        """Score every sentence of one document (GRU context encoder).

        Same contract as ``do_for_one_doc_cnn`` except that all three
        similarity types and the extra sentence features are always used and
        the returned sentence scores already went through a sigmoid.

        NOTE: relies on ``self.context_h0`` (and, through ``apply_context_gru``,
        on ``self.context_gru``), neither of which is created by this class.
        """
        res = []
        hn  = self.context_h0
        for i in range(len(doc_sents_embeds)):
            sent_embeds         = autograd.Variable(torch.FloatTensor(doc_sents_embeds[i]), requires_grad=False)
            gaf                 = autograd.Variable(torch.FloatTensor(sents_af[i]), requires_grad=False)
            if(use_cuda):
                sent_embeds     = sent_embeds.cuda()
                gaf             = gaf.cuda()
            conv_res, hn        = self.apply_context_gru(sent_embeds, hn)
            #
            sim_insens          = self.my_cosine_sim(question_embeds, sent_embeds).squeeze(0)
            sim_oh              = (sim_insens > (1 - (1e-3))).float()
            sim_sens            = self.my_cosine_sim(q_conv_res_trigram, conv_res).squeeze(0)
            #
            insensitive_pooled  = self.pooling_method(sim_insens)
            sensitive_pooled    = self.pooling_method(sim_sens)
            oh_pooled           = self.pooling_method(sim_oh)
            #
            sent_emit           = self.get_output([oh_pooled, insensitive_pooled, sensitive_pooled], q_weights)
            sent_add_feats      = torch.cat([gaf, sent_emit.unsqueeze(-1)])
            res.append(sent_add_feats)
        res = torch.stack(res)
        if(self.sentence_out_method == 'MLP'):
            res = self.sent_out_layer_1(res)
            res = self.sent_out_activ_1(res)
            res = self.sent_out_layer_2(res).squeeze(-1)
        else:
            res = self.apply_sent_res_bigru(res)
        # ret = self.get_max(res).unsqueeze(0)
        ret = self.get_kmax(res, k2)
        res = torch.sigmoid(res)
        return ret, res
    def get_max(self, res):
        """Return the largest element of ``res``."""
        return torch.max(res)
    def get_kmax(self, res, k):
        """Return the ``k`` largest values of ``res`` in ascending order.

        When ``res`` holds fewer than ``k`` values the result is padded with
        zeros *after* the real values so that it always has length ``k``.
        """
        res     = torch.sort(res,0)[0]
        res     = res[-k:].squeeze(-1)
        if(len(res.size())==0):
            res = res.unsqueeze(0)
        if(res.size()[0] < k):
            to_concat       = torch.zeros(k - res.size()[0])
            if(use_cuda):
                to_concat   = to_concat.cuda()
            res             = torch.cat([res, to_concat], -1)
        return res
    def get_max_and_average_of_k_max(self, res, k):
        """Return a 2-element tensor ``[max(res), sum(k largest of res) / k]``.

        The maximum is taken from ``res`` itself rather than from the last slot
        of the k-max vector: that slot is zero padding whenever ``res`` has
        fewer than ``k`` values.  It is reshaped to one element because
        ``torch.cat`` rejects zero-dimensional tensors.
        """
        k_max_pooled            = self.get_kmax(res, k)
        average_k_max_pooled    = k_max_pooled.sum()/float(k)
        the_maximum             = torch.max(res).reshape(1)
        the_concatenation       = torch.cat([the_maximum, average_k_max_pooled.unsqueeze(0)])
        return the_concatenation
    def get_average(self, res):
        """Return the sum of ``res`` divided by its first-dimension size."""
        res = torch.sum(res) / float(res.size()[0])
        return res
    def get_maxmin_max(self, res):
        """Return the maximum of ``res`` after ``self.min_max_norm``.

        NOTE: ``min_max_norm`` is not defined in this class.
        """
        res = self.min_max_norm(res)
        res = torch.max(res)
        return res
    def apply_mesh_gru(self, mesh_embeds):
        """Encode one sequence of embeddings with the mesh GRU and return its last output.

        Requires ``init_mesh_module()`` to have been called.
        """
        mesh_embeds             = autograd.Variable(torch.FloatTensor(mesh_embeds), requires_grad=False)
        if(use_cuda):
            mesh_embeds         = mesh_embeds.cuda()
        output, hn              = self.mesh_gru(mesh_embeds.unsqueeze(1), self.mesh_h0)
        return output[-1,0,:]
    def get_mesh_rep(self, meshes_embeds, q_context):
        """Combine GRU encodings of several sequences, weighted by their best cosine match with ``q_context``."""
        meshes_embeds   = [self.apply_mesh_gru(mesh_embeds) for mesh_embeds in meshes_embeds]
        meshes_embeds   = torch.stack(meshes_embeds)
        sim_matrix      = self.my_cosine_sim(meshes_embeds, q_context).squeeze(0)
        max_sim         = torch.sort(sim_matrix, -1)[0][:, -1]
        output          = torch.mm(max_sim.unsqueeze(0), meshes_embeds)[0]
        return output
    def emit_one(self, doc1_sents_embeds, question_embeds, q_idfs, sents_gaf, doc_gaf):
        """Score a single document for a question (inference path, no loss).

        :param doc1_sents_embeds: one embedding matrix per sentence of the document.
        :param question_embeds: question token embeddings.
        :param q_idfs: idf value(s) per question token, concatenated to the
            contextualised question embeddings.
        :param sents_gaf: extra features per sentence.
        :param doc_gaf: extra document features (used when ``use_doc_extra``).
        :return: ``(document score, per-sentence scores after sigmoid)``.
        """
        q_idfs              = autograd.Variable(torch.FloatTensor(q_idfs),              requires_grad=False)
        question_embeds     = autograd.Variable(torch.FloatTensor(question_embeds),     requires_grad=False)
        doc_gaf             = autograd.Variable(torch.FloatTensor(doc_gaf),             requires_grad=False)
        if(use_cuda):
            q_idfs          = q_idfs.cuda()
            question_embeds = question_embeds.cuda()
            doc_gaf         = doc_gaf.cuda()
        #
        q_context           = self.apply_context_convolution(question_embeds,   self.trigram_conv_1, self.trigram_conv_activation_1)
        q_context           = self.apply_context_convolution(q_context,         self.trigram_conv_2, self.trigram_conv_activation_2)
        # Question-token weights: softmax over a linear score of [context, idf].
        q_weights           = torch.cat([q_context, q_idfs], -1)
        q_weights           = self.q_weights_mlp(q_weights).squeeze(-1)
        q_weights           = F.softmax(q_weights, dim=-1)
        #
        good_out, gs_emits  = self.do_for_one_doc_cnn(doc1_sents_embeds, sents_gaf, question_embeds, q_context, q_weights, self.k_sent_maxpool)
        #
        if(use_doc_extra):
            good_out_pp     = torch.cat([good_out, doc_gaf], -1)
        else:
            good_out_pp     = torch.cat([good_out], -1)
        #
        final_good_output   = self.final_layer_1(good_out_pp)
        final_good_output   = self.final_activ_1(final_good_output)
        final_good_output   = self.final_layer_2(final_good_output)
        #
        if(use_last_layer):
            # Mix every sentence score with the document score before the sigmoid.
            gs_emits            = gs_emits.unsqueeze(-1)
            gs_emits            = torch.cat([gs_emits, final_good_output.unsqueeze(-1).expand_as(gs_emits)], -1)
            gs_emits            = self.oo_layer(gs_emits).squeeze(-1)
            gs_emits            = torch.sigmoid(gs_emits)
        else:
            gs_emits            = torch.sigmoid(gs_emits)
        #
        return final_good_output, gs_emits
    def forward(self, doc1_sents_embeds, doc2_sents_embeds, question_embeds, q_idfs, sents_gaf, sents_baf, doc_gaf, doc_baf):
        """Score a (good, bad) document pair for one question and compute the ranking loss.

        ``doc1_*`` / ``*_gaf`` belong to the document treated as the positive
        example, ``doc2_*`` / ``*_baf`` to the negative one; both go through
        the same layers as in ``emit_one``.

        :return: ``(hinge loss, good document score, bad document score,
            good sentence scores, bad sentence scores)``; sentence scores are
            returned after a sigmoid.
        """
        q_idfs              = autograd.Variable(torch.FloatTensor(q_idfs),              requires_grad=False)
        question_embeds     = autograd.Variable(torch.FloatTensor(question_embeds),     requires_grad=False)
        doc_gaf             = autograd.Variable(torch.FloatTensor(doc_gaf),             requires_grad=False)
        doc_baf             = autograd.Variable(torch.FloatTensor(doc_baf),             requires_grad=False)
        if(use_cuda):
            q_idfs          = q_idfs.cuda()
            question_embeds = question_embeds.cuda()
            doc_gaf         = doc_gaf.cuda()
            doc_baf         = doc_baf.cuda()
        #
        q_context           = self.apply_context_convolution(question_embeds,   self.trigram_conv_1, self.trigram_conv_activation_1)
        q_context           = self.apply_context_convolution(q_context,         self.trigram_conv_2, self.trigram_conv_activation_2)
        #
        q_weights           = torch.cat([q_context, q_idfs], -1)
        q_weights           = self.q_weights_mlp(q_weights).squeeze(-1)
        q_weights           = F.softmax(q_weights, dim=-1)
        #
        good_out, gs_emits  = self.do_for_one_doc_cnn(doc1_sents_embeds, sents_gaf, question_embeds, q_context, q_weights, self.k_sent_maxpool)
        bad_out, bs_emits   = self.do_for_one_doc_cnn(doc2_sents_embeds, sents_baf, question_embeds, q_context, q_weights, self.k_sent_maxpool)
        #
        if(use_doc_extra):
            good_out_pp     = torch.cat([good_out, doc_gaf], -1)
            bad_out_pp      = torch.cat([bad_out, doc_baf], -1)
        else:
            good_out_pp     = torch.cat([good_out], -1)
            bad_out_pp      = torch.cat([bad_out], -1)
        #
        final_good_output   = self.final_layer_1(good_out_pp)
        final_good_output   = self.final_activ_1(final_good_output)
        final_good_output   = self.final_layer_2(final_good_output)
        ###################
        final_bad_output    = self.final_layer_1(bad_out_pp)
        final_bad_output    = self.final_activ_1(final_bad_output)
        final_bad_output    = self.final_layer_2(final_bad_output)
        ###################
        if(use_last_layer):
            gs_emits        = gs_emits.unsqueeze(-1)
            gs_emits        = torch.cat([gs_emits, final_good_output.unsqueeze(-1).expand_as(gs_emits)], -1)
            gs_emits        = self.oo_layer(gs_emits).squeeze(-1)
            gs_emits        = torch.sigmoid(gs_emits)
            #####################
            bs_emits            = bs_emits.unsqueeze(-1)
            bs_emits            = torch.cat([bs_emits, final_bad_output.unsqueeze(-1).expand_as(bs_emits)], -1)
            bs_emits            = self.oo_layer(bs_emits).squeeze(-1)
            bs_emits            = torch.sigmoid(bs_emits)
        else:
            gs_emits            = torch.sigmoid(gs_emits)
            bs_emits            = torch.sigmoid(bs_emits)
        loss1               = self.my_hinge_loss(final_good_output, final_bad_output)
        return loss1, final_good_output, final_bad_output, gs_emits, bs_emits

def load_model_from_checkpoint(resume_from):
    """Load the weights stored in a checkpoint file into the global ``model``.

    The checkpoint is expected to be a dict with a ``'state_dict'`` entry
    (passed to ``model.load_state_dict``) and an ``'epoch'`` entry (only
    printed).  Tensors are mapped to CPU storage while loading.

    NOTE: ``torch.load`` unpickles the file, so only trusted checkpoints
    should be loaded.

    :param resume_from: path of the checkpoint file.
    :raises SystemExit: with exit status 1 when ``resume_from`` is not a file,
        so that a missing checkpoint is reported to the shell as a failure.
    """
    if not os.path.isfile(resume_from):
        print('ERROR! 404!!! EXITING!')
        raise SystemExit(1)
    print("=> loading checkpoint '{}'".format(resume_from))
    checkpoint = torch.load(resume_from, map_location=lambda storage, loc: storage)
    model.load_state_dict(checkpoint['state_dict'])
    print("=> loaded checkpoint '{}' (epoch {})".format(resume_from, checkpoint['epoch']))


################################################################################################################|

# ---- Input resources -------------------------------------------------------
# Binary word2vec file loaded with gensim near the end of the script.
w2v_bin_path = '/home/dpappas/bioasq_all/pubmed2018_w2v_30D.bin'
# Pickle handed to load_idfs().
idf_pickle_path = '/home/dpappas/bioasq_all/idf.pkl'
# Text file with one Elasticsearch node address per line.
elk_ips = '/home/dpappas/elk_ips.txt'
# Elasticsearch index that is searched for documents.
doc_index = 'pubmed_abstracts_joint_0_1'
# Pickled collection of stopwords.
stopwords_path = '/home/dpappas/bioasq_all/stopwords.pkl'

# ---- Outputs and model -----------------------------------------------------
# Directory that receives every file written by this script.
odir = './temp_dir_retr/'
# Checkpoint whose weights are loaded into the model.
resume_from = '/media/dpappas/dpappas_data/models_out/ablations/ablation_1111111_bioasq_jpdrmm_2L_0p01_run_0/best_dev_checkpoint.pth.tar'

################################################################################################################|


# Characters deleted by both tokenizers.  The pattern is a raw string so that
# ``\[`` and ``\]`` are not treated as (invalid) string escapes.
# NOTE: inside the character class ``:-\[`` is a *range* from ':' to '[', so
# the class also removes ``; < = > ? @`` (and A-Z, irrelevant after lowering)
# and does NOT remove '-'.  This is kept on purpose: changing it would change
# the tokens produced for existing vocabularies.
_BIOCLEAN_PUNCT = re.compile(r'[.,?;*!%^&_+():-\[\]{}]')


def _strip_quotes_and_slashes(t):
    """Remove double quotes, slashes, backslashes and single quotes from ``t``."""
    return t.replace('"', '').replace('/', '').replace('\\', '').replace("'", '')


def bioclean_mod(t):
    """Tokenise ``t``: like ``bioclean`` but hyphens are first turned into spaces.

    :param t: input text.
    :return: list of lower-cased, punctuation-free tokens.
    """
    cleaned = _strip_quotes_and_slashes(t).replace("-", ' ').strip().lower()
    return _BIOCLEAN_PUNCT.sub('', cleaned).split()


def bioclean(t):
    """Tokenise ``t``: drop quotes/slashes, lower-case, delete punctuation, split on whitespace.

    Hyphens are kept, so hyphenated words stay single tokens.

    :param t: input text.
    :return: list of tokens.
    """
    cleaned = _strip_quotes_and_slashes(t).strip().lower()
    return _BIOCLEAN_PUNCT.sub('', cleaned).split()

# Read the Elasticsearch node addresses: one per line, blank lines ignored.
# The ``with`` block closes the file, so no explicit close() is needed.
with open(elk_ips) as fp:
    cluster_ips = [line.strip() for line in fp if line.strip()]

# The client only needs the parsed addresses, so it is built after the file
# has been closed.
es = Elasticsearch(cluster_ips, verify_certs=True, timeout=150, max_retries=10, retry_on_timeout=True)

# NOTE: pickle.load can execute code embedded in the file; only point
# ``stopwords_path`` at a trusted pickle.
with open(stopwords_path, 'rb') as f:
    stopwords = pickle.load(f)

stopwords.add('with')
# Printed before the NLTK list is merged in, i.e. this shows the pickled
# stopwords plus 'with' only.
print(stopwords)
stopwords.update(nltk.corpus.stopwords.words("english"))

# Questions to run through retrieval; the position in this list is the query id.
questions       = [
    'Does Rhophilin Rho GTPase Binding Protein 2 (RHPN2) regulate sucrose levels ?',
    'Does Rhophilin Rho GTPase Binding Protein 2 (RHPN2) regulate sucrase-isomaltase expression ?',
    'Does Ras Homolog Family Member A (RhoA) regulate sucrose levels ?',
    'Does Ras Homolog Family Member A (RhoA) regulate sucrase-isomaltase expression ?',
    'What is the mechanism linking Rhophilin Rho GTPase Binding Protein 2 with colorectal cancer ?',
    'What is the mechanism linking Ras Homolog Family Member A (RhoA) with colorectal cancer ?',
    'Does Solute Carrier Family 6 Member 18 regulate amino acid levels in urine ?',
]

# The indexed ``joint_text`` field holds the title, this separator, then the abstract.
_JOINT_TEXT_SEP     = '--------------------'

test_docs_to_save   = {}                    # document id -> document record
test_data_to_save   = {'queries' : []}      # one entry per question
all_retrieved       = {}                    # question text -> ranked documents
# enumerate(tqdm(...)) rather than tqdm(enumerate(...)) so the progress bar
# knows how many questions there are.
for qid, qtext in enumerate(tqdm(questions)):
    print(qtext)
    #######################################################
    results     = get_first_n_1(qtext, 100, expand=True)
    print([t['_id'] for t in results])
    #######################################################
    all_scores  = [res['_score'] for res in results]
    if all_scores:
        # Standardise the scores of this query in one vectorised call.
        score_column    = np.array(all_scores).reshape(-1, 1)
        scaler          = StandardScaler().fit(score_column)
        norm_scores     = scaler.transform(score_column)[:, 0]
    else:
        # StandardScaler cannot be fitted on zero samples; a query without
        # hits simply yields an empty result list.
        norm_scores     = []
    #######################################################
    temp_1      = {
        'num_rel'               : 0,
        'num_rel_ret'           : 0,
        'num_ret'               : len(all_scores),
        'query_id'              : qid,
        'query_text'            : qtext,
        'relevant_documents'    : [],
        'retrieved_documents'   : []
    }
    all_retrieved[qtext] = []
    #######################################################
    for rank, (res, norm_score) in enumerate(zip(results, norm_scores), start=1):
        # partition() keeps working when the separator is missing: the whole
        # text then becomes the title and the abstract is empty.
        title, _sep, abstract = res['_source']['joint_text'].partition(_JOINT_TEXT_SEP)
        title, abstract       = title.strip(), abstract.strip()
        test_docs_to_save[res['_id']] = {
            'abstractText'      : abstract,
            'author'            : '',
            'country'           : '',
            'journalName'       : '',
            'keywords'          : '',
            'meshHeadingsList'  : [],
            'pmid'              : res['_id'],
            'publicationDate'   : res['_source']['DateCompleted'],
            'title'             : title
        }
        #######################################################
        temp_1['retrieved_documents'].append({
                'bm25_score'        : res['_score'],
                'doc_id'            : res['_id'],
                'is_relevant'       : False,
                'norm_bm25_score'   : norm_score,
                'rank'              : rank
            })
        all_retrieved[qtext].append(
            {
                'rank'              : rank,
                'title'             : title,
                'abstractText'      : abstract,
                'author'            : '',
                'country'           : '',
                'journalName'       : '',
                'keywords'          : '',
                'meshHeadingsList'  : [],
                'pmid'              : res['_id'],
                'publicationDate'   : res['_source']['DateCompleted'],
                'bm25_score'        : res['_score'],
                'norm_bm25_score'   : norm_score
            }
        )
    test_data_to_save['queries'].append(temp_1)
    # break


# Create the output directory if needed (no separate existence check, which
# would race with other processes creating it).
os.makedirs(odir, exist_ok=True)

# The pickles are read back further down, so the files are written inside
# ``with`` blocks to guarantee they are flushed and closed first.
with open(os.path.join(odir, 'bioasq9_bm25_top100.test.pkl'), 'wb') as f:
    pickle.dump(test_data_to_save, f)

with open(os.path.join(odir, 'bioasq9_bm25_docset_top100.test.pkl'), 'wb') as f:
    pickle.dump(test_docs_to_save, f)

# Human-readable dump of everything that was retrieved, keyed by question text.
with open(os.path.join(odir, 'all_retrieved.json'), 'w', encoding='utf-8') as f:
    f.write(json.dumps(all_retrieved, indent=4, sort_keys=True))

################################################################################################################|

# ---- Thresholds ------------------------------------------------------------
# Documents scoring at or below min_doc_score are dropped from the ranking.
min_doc_score = -1000.
min_sent_score = -1000.
emit_only_abstract_sents = False

# ---- Device and input files ------------------------------------------------
use_cuda = torch.cuda.is_available()
# Same file names as the two pickles written by the retrieval step above.
f_in2 = os.path.join(odir, 'bioasq9_bm25_top100.test.pkl')
f_in3 = os.path.join(odir, 'bioasq9_bm25_docset_top100.test.pkl')

# ---- Feature switches read by Sent_Posit_Drmm_Modeler ------------------------
use_sent_extra = True       # extra per-sentence features
use_doc_extra = True        # extra per-document features
use_OH_sim = True           # exact-match similarity
use_W2V_sim = True          # raw embedding similarity
use_context_sim = True      # contextualised embedding similarity
use_sent_loss = True
use_last_layer = True       # mix sentence and document scores in the last layer

# NOTE(review): these three are not referenced in this part of the file;
# presumably corpus statistics used by the feature extraction -- confirm.
avgdl = 21.1907
mean = 0.6275
deviation = 1.2210
print(avgdl, mean, deviation)

# ---- Model hyper-parameters --------------------------------------------------
k_for_maxpool = 5
embedding_dim = 30 #200

# ---- Reproducibility ---------------------------------------------------------
my_seed = 1
random.seed(my_seed)
torch.manual_seed(my_seed)

# ---- Build the model and load the trained weights ---------------------------
print('Compiling model...')
model = Sent_Posit_Drmm_Modeler(embedding_dim=embedding_dim, k_for_maxpool=k_for_maxpool)
if use_cuda:
    model = model.cuda()

load_model_from_checkpoint(resume_from)
# Inference only: freeze every parameter.
params = model.parameters()
for param in params:
    param.requires_grad = False

print_params(model)

# ---- Load the retrieval results written earlier ------------------------------
print('loading pickle data')
with open(f_in2, 'rb') as f:
    test_data = pickle.load(f)

with open(f_in3, 'rb') as f:
    test_docs = pickle.load(f)

# GetWords fills ``words`` in place; it is then used to restrict the idf table
# and the embedding table to what is actually needed.
words = {}
GetWords(test_data, test_docs, words)

print('loading idfs')
idf, max_idf = load_idfs(idf_pickle_path, words)
print('loading w2v')
wv = KeyedVectors.load_word2vec_format(w2v_bin_path, binary=True)
# Keep only the vectors of words collected above, as a plain dict.
wv = {word: wv[word] for word in wv.vocab.keys() if word in words}

# ---- Score everything and write the output files -----------------------------
test_map = get_one_map('test', test_data, test_docs, use_sent_tokenizer=True)

print(test_map)


# demo_retrieve.py