
Commit

Merge pull request #66 from wayfair-incubator/dependabot/pip/black-20.8b1

Bump black from 19.10b0 to 20.8b1
romatik authored Mar 17, 2021
2 parents 9125837 + 3e7bd8e commit 4e104a3
Showing 9 changed files with 70 additions and 60 deletions.
14 changes: 7 additions & 7 deletions extra_model/_adjectives.py
@@ -10,9 +10,9 @@

def cluster_adjectives(adjective_counts, vectorizer): # noqa: C901
"""Cluster adjectives based on a constant radius clustering algorithm.
The technical implementation uses a scikit-learn BallTree.
:param adjective_counts: dictionary with adjectives and their counts
:type adjective_counts: [(str,int)]
:param vectorizer: provide embeddings to evaluate adjective similarity
@@ -118,10 +118,10 @@ def cluster_adjectives(adjective_counts, vectorizer): # noqa: C901

def fill_sentiment_dict(adjective_counts):
"""Given a dictionary with adjectives and their counts, will compute.
The sentiment of each of the adjectives using the VADER sentiment analysis package
and return a dictionary of the adjectives and their sentiments.
:param adjective_counts: dictionary with adjectives and their counts
:type adjective_counts: dict
:return: dictionary, where the keys are the adjectives and the values are tuples of the
@@ -143,7 +143,7 @@ def fill_sentiment_dict(adjective_counts):

def sentiments_from_adjectives(adjective_counts, sentiment_dict):
"""Build the weighted average sentiment score from a list of adjetives and their counts.
:param adjective_counts: list of tuples with adjectives and their counts
:type adjective_counts: [(str,int)]
:param sentiment_dict: dictionary with adjectives and their sentiment, as tuple of compound and binary sentiment
@@ -170,11 +170,11 @@ def sentiments_from_adjectives(adjective_counts, sentiment_dict):

def adjective_info(dataframe_topics, dataframe_aspects, vectorizer):
"""Add adjective related information to the dataframes.
This has two facets:
-> for each topic cluster similar adjectives, to get a more abstract/readable list
-> for each topic, use the adjectives to come up with a sentiment classification
:param dataframe_topics: the dataframe with the topics we want to enrich, needs to have a column `rawterms`
:type dataframe_topics: :class:`pandas.DataFrame`
:param dataframe_aspects: the dataframe with the aspect instances and related adjectives with columns `aspect` and `descriptor`
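
The cluster_adjectives docstring above describes constant-radius clustering over a scikit-learn BallTree, and fill_sentiment_dict / sentiments_from_adjectives describe count-weighted VADER sentiments. A minimal sketch of both ideas, assuming pre-computed embedding vectors and the vaderSentiment package; the radius, toy data and helper names are illustrative, not the project's actual implementation:

import numpy as np
from sklearn.neighbors import BallTree
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

def radius_clusters(adjectives, vectors, radius=0.2):
    # Greedy constant-radius clustering: every adjective within `radius`
    # of an unclaimed seed joins that seed's cluster.
    tree = BallTree(vectors)
    unassigned = set(range(len(adjectives)))
    clusters = []
    for seed in range(len(adjectives)):
        if seed not in unassigned:
            continue
        (members,) = tree.query_radius(vectors[seed : seed + 1], r=radius)
        members = sorted(i for i in members if i in unassigned)
        unassigned.difference_update(members)
        clusters.append([adjectives[i] for i in members])
    return clusters

# Hypothetical usage with made-up 2-D "embeddings":
adjs = ["good", "great", "bad", "awful"]
vecs = np.array([[0.9, 0.1], [0.85, 0.15], [0.1, 0.9], [0.12, 0.88]])
print(radius_clusters(adjs, vecs))  # [['good', 'great'], ['bad', 'awful']]

# Count-weighted compound sentiment for one cluster, in the spirit of
# sentiments_from_adjectives (the counts here are invented):
analyzer = SentimentIntensityAnalyzer()
counts = {"good": 10, "great": 5}
total = sum(counts.values())
print(sum(analyzer.polarity_scores(a)["compound"] * n for a, n in counts.items()) / total)
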
17 changes: 9 additions & 8 deletions extra_model/_aspects.py
@@ -15,7 +15,7 @@

def compound_noun_list(token):
"""Find compound nouns.
:param token: token for which to generate potential compound nouns
:type token: :class:`spacy.token`
:return: list of potential compounds
@@ -33,7 +33,7 @@ def compound_noun_list(token):

def acomp_list(tokens):
"""Find descriptions for a given token.
:param tokens: list of tokens that are children of the head of the noun for which descriptions are searched.
:type tokens: [:class:`spacy.token`]
:return: list of adjectives
@@ -55,7 +55,7 @@ def acomp_list(tokens):

def adjective_list(tokens):
"""Find adjectives modifying a given noun.
:param tokens: tokens of potential adjective candidates (children of the noun and children of the head for compounds)
:type tokens: [:class:`spacy.token`]
:return: list of adjectives
@@ -77,7 +77,7 @@ def adjective_list(tokens):

def adjective_negations(token):
"""Find all negated adjectives in a sentence.
:param token: negation token to handle
:type token: :class:`spacy.token`
:return: list of negated adjectives
@@ -108,9 +108,9 @@ def adjective_negations(token):

def parse(dataframe_texts): # noqa: C901
"""Parse the comments and extract a list of potential aspects based on grammatical relations.
(e.g. modified by an adjective)
:param dataframe_texts: a dataframe with the raw texts. The column with the texts needs to be called 'Comments'
:type dataframe_texts: :class:`pandas.DataFrame`
:return: a dataframe with the aspect candidates
@@ -128,7 +128,8 @@ def parse(dataframe_texts): # noqa: C901
# n_threads > 5 can segfault with long (>500 tokens) sentences
# n_threads has been deprecated in spacy 3.x - https://spacy.io/usage/v2-1#incompat
for index, document in zip(
dataframe_texts.index, nlp.pipe(dataframe_texts.Comments, batch_size=500),
dataframe_texts.index,
nlp.pipe(dataframe_texts.Comments, batch_size=500),
): # TODO reduce for production/make configurable
negated_adjectives = []
for token in document:
@@ -171,7 +172,7 @@ def parse(dataframe_texts): # noqa: C901

def generate_aspects(dataframe_texts):
"""Generate the aspects that will be merged into topics from the raw texts.
:param dataframe_texts: a dataframe with the raw texts in the column 'Comments'
:type dataframe_texts: :class:`pandas.DataFrame`
:return: a dataframe with the aspect candidates, their associated description, index of original text in the
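
The parse hunks above pipe the comments through spaCy and keep nouns that carry adjectival or predicative descriptions. A stripped-down sketch of that pattern; only the 'Comments' column name and the batch_size=500 call come from the diff, while the model choice and dependency labels are standard spaCy but still an assumption about the project's exact logic:

import pandas as pd
import spacy

# Assumes the small English model has been downloaded beforehand.
nlp = spacy.load("en_core_web_sm")

texts = pd.DataFrame({"Comments": ["The battery life is great.", "Terrible screen."]})

aspect_rows = []
for index, doc in zip(texts.index, nlp.pipe(texts.Comments, batch_size=500)):
    for token in doc:
        if token.pos_ != "NOUN":
            continue
        # adjectival modifiers attached directly to the noun, e.g. "terrible screen"
        descriptors = [child.text for child in token.children if child.dep_ == "amod"]
        # predicative adjectives reached through the verb, e.g. "battery life is great"
        if token.dep_ == "nsubj":
            descriptors += [c.text for c in token.head.children if c.dep_ == "acomp"]
        for adjective in descriptors:
            aspect_rows.append(
                {"source_text": index, "aspect": token.text, "descriptor": adjective}
            )

print(pd.DataFrame(aspect_rows))
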
8 changes: 4 additions & 4 deletions extra_model/_disambiguate.py
@@ -14,7 +14,7 @@

def vectorize_aspects(aspect_counts, vectorizer):
"""Turn the aspect map into a a vector of nouns and their vector representations, which also filters aspects without embedding.
:param aspect_counts: (dict): the dictionary with aspect counts
:param vectorizer: (Vectorizer): the provider of word-embeddings
:return: vectors with representable aspects and their vector embeddings
@@ -32,7 +32,7 @@ def vectorize_aspects(aspect_counts, vectorizer):
def best_cluster(aspect_vectors):
"""
Find the optimal cluster size using silhouette scores.
:param aspect_vectors: list of embeddings vectors to be clustered
:type aspect_vectors: [:class:`numpy.array`]
:return: the optimal number of clusters
@@ -73,7 +73,7 @@ def best_cluster(aspect_vectors):

def cluster(aspects, aspect_vectors, vectorizer):
"""Cluster aspects based on the distance of their vector representations.
Once clusters are found, use the other aspects in a given cluster to generate the
context for a specific aspect noun
@@ -115,7 +115,7 @@ def cluster(aspects, aspect_vectors, vectorizer):

def match(aspect_counts, vectorizer):
"""Match a word to a specific wordnet entry, using the vector similarity of the aspects context and the synonym gloss.
:param aspect_counts: Counter object of aspect->number of occurrence
:type aspect_counts: :class:`collections.Counter`
:param vectorizer: the provider of word-embeddings for context generation
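
best_cluster above picks the number of clusters from silhouette scores. A small sketch of that selection loop, assuming k-means as the underlying clustering; the k range and toy data are illustrative and the project may cluster differently:

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def best_cluster(aspect_vectors, max_k=10):
    # Try k = 2 .. max_k and keep the k with the highest silhouette score.
    X = np.asarray(aspect_vectors)
    best_k, best_score = 2, -1.0
    for k in range(2, min(max_k, len(X) - 1) + 1):
        labels = KMeans(n_clusters=k, n_init=10, random_state=0).fit_predict(X)
        score = silhouette_score(X, labels)
        if score > best_score:
            best_k, best_score = k, score
    return best_k

# Two well-separated blobs, so the optimum should come out as 2.
rng = np.random.default_rng(0)
X = np.vstack([rng.normal(0, 0.1, (20, 5)), rng.normal(3, 0.1, (20, 5))])
print(best_cluster(X))
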
2 changes: 1 addition & 1 deletion extra_model/_filter.py
@@ -8,7 +8,7 @@

def filter(dataframe):
"""Filter a dataframe for language and text length.
The following rules apply:
1. Only comments with at least 20 characters are retained.
2. Only comments in English are retained.
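
The filter rules above keep only English comments of at least 20 characters. A hedged sketch using pandas plus the langdetect package as a stand-in language detector; the actual detector and the function name in the project may differ:

import pandas as pd
from langdetect import detect

def filter_comments(dataframe):
    # Rule 1: at least 20 characters.
    long_enough = dataframe["Comments"].str.len() >= 20

    # Rule 2: detected as English.
    def is_english(text):
        try:
            return detect(text) == "en"
        except Exception:  # langdetect raises on empty or undetectable text
            return False

    return dataframe[long_enough & dataframe["Comments"].map(is_english)]

comments = pd.DataFrame({"Comments": ["Great product, works exactly as described.", "ok"]})
print(filter_comments(comments))  # keeps only the first row
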
8 changes: 4 additions & 4 deletions extra_model/_summarize.py
@@ -6,7 +6,7 @@

def qa(dataframe_texts, dataframe_aspects, dataframe_topics):
"""Print summary information.
:param dataframe_texts: dataframe with the raw texts (for example output)
:type dataframe_texts: :class:`pandas.DataFrame`
:param dataframe_aspects: dataframe with the aspects
@@ -75,7 +75,7 @@ def qa(dataframe_texts, dataframe_aspects, dataframe_topics):

def set_aspect(topic, dataframe_aspects):
"""For a given topic, set topic and adjective cluster fields in the aspect_dataframe.
:param topic: the topic and its associated information that we need to copy to the relevant entries in the aspect frame
:type topic: :class:`pandas.DataFrame.Row`
:param dataframe_aspects: the dataframe to be enriched with topic information
@@ -99,7 +99,7 @@ def set_aspect(topic, dataframe_aspects):

def link_aspects_to_topics(dataframe_aspects, dataframe_topics):
"""Fill topic and adjective cluster information into the aspect dataframe.
:param dataframe_aspects: the dataframe to be enriched
:type dataframe_aspects: :class:`pandas.DataFrame`
:param dataframe_topics: the dataframe that has the topic and adjective cluster information
@@ -122,7 +122,7 @@ def link_aspects_to_topics(dataframe_aspects, dataframe_topics):

def link_aspects_to_texts(dataframe_aspects, dataframe_texts):
"""Transfer the original text identifier from the original text data table into the final aspect table.
:param dataframe_aspects: table to be enriched
:type dataframe_aspects: :class:`pandas.DataFrame`
:param dataframe_texts: original table from which this information is extracted
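
link_aspects_to_topics above copies topic and adjective-cluster fields onto every aspect row whose raw term belongs to that topic. One possible pandas sketch of that enrichment, assuming the topic frame stores its raw terms as lists in a `rawterms` column as the _adjectives docstring suggests; all other column names are invented for illustration:

import pandas as pd

dataframe_topics = pd.DataFrame(
    {
        "topic": ["display", "battery"],
        "rawterms": [["screen", "display"], ["battery"]],
        "adjective_cluster": [["bright", "dim"], ["short-lived"]],
    }
)
dataframe_aspects = pd.DataFrame({"aspect": ["screen", "battery", "screen"]})

# One row per (topic, raw term), then attach the topic columns to every
# aspect whose raw term matches.
term_to_topic = dataframe_topics.explode("rawterms").rename(columns={"rawterms": "aspect"})
enriched = dataframe_aspects.merge(
    term_to_topic[["aspect", "topic", "adjective_cluster"]], on="aspect", how="left"
)
print(enriched)
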
18 changes: 9 additions & 9 deletions extra_model/_topics.py
@@ -17,7 +17,7 @@

def path_to_graph(hypernym_list, initialnoun):
"""Make a hypernym chain into a graph.
:param hypernym_list: list of hypernyms for a word as obtained from wordnet
:type hypernym_list: [str]
:param initialnoun: the initial noun (we need this to mark it as leaf in the tree)
@@ -39,7 +39,7 @@ def path_to_graph(hypernym_list, initialnoun):

def get_nodevec(node, vectors):
"""Get the vector representation of a gloss a wordnet node.
Used to evaluate similarity between rungs in the hypernym chain.
:param node: the wordnet node for which to compute the embedding
:type node: str
@@ -58,7 +58,7 @@ def get_nodevec(node, vectors):

def iterate(transition_matrix, importance, original, alpha):
"""Find the stable importance vector by iterated multiplication with the distance matrix.
This function does a simple iteration. The "jump-back" probability from the paper
is implemented as a linear superposition of the new and original importance numbers.
:param transition_matrix: The connectedness matrix of the graph, including similarity weights.
@@ -83,7 +83,7 @@ def iterate(transition_matrix, importance, original, alpha):

def aggregate(aspects, aspect_counts, synsets_match, vectors): # noqa: C901
"""Aggregate the aspects by building a tree from the hypernym chains.
Using a page-rank type algorithm to assign importance to the nodes in the graph,
we only consider wordnet entries for this, not the actual aspects extracted from the texts.
:param aspects: List of aspects to analyze
@@ -210,7 +210,7 @@ def traverse_tree( # noqa: C901
node_list, associated_aspects, aspect_counts, full_tree, weighted, direction
):
"""Find all hypernyms/hyponyms in the tree to a given node.
Aggregate the number of associated mentions in the original texts, optionally
weighted by term-similarity.
:param node_list: List of nodes from which to gather the subsidiary terms and their initial mentions
@@ -267,7 +267,7 @@ def traverse_tree( # noqa: C901

def collect_topic_info(filtered_topics, removed_topics, aspect_counts, full_tree):
"""Gather various bits of information into a single DataFrame.
For each topic we store the importance, the list of associated raw text terms and their numbers.
:param filtered_topics: List of topics remaining after filtering out low-importance subsidiary topics
:type filtered_topics: [str]
@@ -350,7 +350,7 @@ def collect_topic_info(filtered_topics, removed_topics, aspect_counts, full_tree

def has_connection(term, prior, full_tree):
"""Check if two terms are connected within the directed hyopernym graph.
:param term: first node to test
:type term: str
:param prior: second node to test
@@ -369,7 +369,7 @@ def has_connection(term, prior, full_tree):

def filter_aggregates(topics, tree):
"""Filter the importance-sorted list, so that each remaining topic is the sole member of its hypernym chain.
:param topics: List of all topics in the graph
:type topics: [str]
:param tree: the graph which is being traversed
@@ -398,7 +398,7 @@ def filter_aggregates(topics, tree):

def get_topics(dataframe_aspects, vectors):
"""Generate the semantically clustered topics from the raw aspects.
:param dataframe_aspects: the collection of nouns to be aggregated into topics
:type dataframe_aspects: :class:`pandas.DataFrame`
:param vectors: provides embeddings for context clustering and word-sense disambiguation
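
The iterate docstring above describes a page-rank style fixed point: multiply the importance vector by the transition matrix and blend the result with the original importances via the jump-back weight alpha. A small numpy sketch of that update rule; the tolerance, iteration cap and toy matrix are illustrative:

import numpy as np

def iterate_importance(transition_matrix, original, alpha, tol=1e-9, max_iter=1000):
    # importance <- alpha * T @ importance + (1 - alpha) * original, repeated
    # until the vector stops changing (the linear superposition "jump-back").
    importance = original.copy()
    for _ in range(max_iter):
        updated = alpha * transition_matrix @ importance + (1 - alpha) * original
        if np.linalg.norm(updated - importance, ord=1) < tol:
            return updated
        importance = updated
    return importance

# Tiny three-node graph with equal transition weights.
T = np.array([[0.0, 0.5, 0.5],
              [0.5, 0.0, 0.5],
              [0.5, 0.5, 0.0]])
original = np.array([0.6, 0.2, 0.2])
print(iterate_importance(T, original, alpha=0.85))
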
4 changes: 2 additions & 2 deletions extra_model/_vectorizer.py
@@ -12,7 +12,7 @@ class Vectorizer:
def __init__(self, embedding_file):
"""
Use the generic gensim vector embedding lookup.
Currently uses pretrained GloVe embeddings, but anything goes.
:param embedding_file: pathname for the file that stores the word-embeddings in gensim keyed-vectors format
:type embedding_file: str
@@ -25,7 +25,7 @@ def __init__(self, embedding_file):
def get_vector(self, key):
"""
Return the vector embedding for a given word.
According to the following logic:
- if no embedding is found for this word, check if it's a compound
- if it's a compound try to take the average embedding of the constituent words
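
get_vector above falls back to averaging constituent words when a compound has no embedding of its own. A sketch of that lookup against gensim KeyedVectors, hedged because the exact load call, gensim version and compound delimiter used by the project may differ:

import numpy as np
from gensim.models import KeyedVectors

class Vectorizer:
    def __init__(self, embedding_file):
        # Assumes embeddings saved via KeyedVectors.save(); a word2vec/GloVe
        # text file would need load_word2vec_format instead.
        self.vectors = KeyedVectors.load(embedding_file)

    def get_vector(self, key):
        # Direct hit first (gensim >= 4 vocabulary lookup).
        if key in self.vectors.key_to_index:
            return self.vectors[key]
        # Compound: average whichever constituent words have embeddings.
        parts = [p for p in key.split() if p in self.vectors.key_to_index]
        if parts:
            return np.mean([self.vectors[p] for p in parts], axis=0)
        return None  # no embedding available for this word

# vectorizer = Vectorizer("glove.kv")  # hypothetical embedding file path
# vectorizer.get_vector("battery life")
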
2 changes: 1 addition & 1 deletion requirements-test.txt
@@ -1,5 +1,5 @@
bandit==1.7.0
black==19.10b0
black==20.8b1
flake8==3.9.0
isort==5.7.0
mypy==0.812
57 changes: 33 additions & 24 deletions tests/test_topics.py
@@ -138,25 +138,31 @@ def test__aggregate(vec):


def test__traverse_tree__down_weighted(simple_graph):
assert traverse_tree(
[("R", 1)],
{},
{"L1": 4, "L2": 1},
simple_graph,
weighted=True,
direction="down",
) == {"L1": 2, "L2": 0.5}
assert (
traverse_tree(
[("R", 1)],
{},
{"L1": 4, "L2": 1},
simple_graph,
weighted=True,
direction="down",
)
== {"L1": 2, "L2": 0.5}
)


def test__traverse_tree__down_unweighted(simple_graph):
assert traverse_tree(
[("R", 1)],
{},
{"L1": 4, "L2": 1},
simple_graph,
weighted=False,
direction="down",
) == {"L1": 4, "L2": 1}
assert (
traverse_tree(
[("R", 1)],
{},
{"L1": 4, "L2": 1},
simple_graph,
weighted=False,
direction="down",
)
== {"L1": 4, "L2": 1}
)


def test__traverse_tree__up_weighted(simple_graph):
@@ -166,14 +172,17 @@ def test__traverse_tree__up_weighted(simple_graph):


def test__traverse_tree__up_unweighted(simple_graph):
assert traverse_tree(
[("I1", 1)],
{},
{"L1": 4, "L2": 1},
simple_graph,
weighted=False,
direction="up",
) == {"L1": 4}
assert (
traverse_tree(
[("I1", 1)],
{},
{"L1": 4, "L2": 1},
simple_graph,
weighted=False,
direction="up",
)
== {"L1": 4}
)


@pytest.mark.skip(
