From 2e9679769621449db4fa656483d956628cd52f96 Mon Sep 17 00:00:00 2001 From: Joe Schiff <41972063+JoeSchiff@users.noreply.github.com> Date: Tue, 16 Apr 2024 05:51:14 -0400 Subject: [PATCH 01/29] Convert properties to decorator syntax (#13390) --- spacy/lexeme.pyx | 429 ++++++++++++++++++++----------------- spacy/tokenizer.pyx | 125 ++++++----- spacy/tokens/doc.pyx | 173 +++++++-------- spacy/tokens/span.pyx | 148 +++++++------ spacy/tokens/token.pyx | 334 +++++++++++++++-------------- spacy/training/example.pyx | 36 ++-- spacy/vocab.pyx | 42 ++-- 7 files changed, 684 insertions(+), 603 deletions(-) diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index f803d5e9394..7a0c19bf301 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -164,45 +164,48 @@ cdef class Lexeme: vector = self.vector return numpy.sqrt((vector**2).sum()) - property vector: + @property + def vector(self): """A real-valued meaning representation. RETURNS (numpy.ndarray[ndim=1, dtype='float32']): A 1D numpy array representing the lexeme's semantics. """ - def __get__(self): - cdef int length = self.vocab.vectors_length - if length == 0: - raise ValueError(Errors.E010) - return self.vocab.get_vector(self.c.orth) - - def __set__(self, vector): - if len(vector) != self.vocab.vectors_length: - raise ValueError(Errors.E073.format(new_length=len(vector), - length=self.vocab.vectors_length)) - self.vocab.set_vector(self.c.orth, vector) - - property rank: + cdef int length = self.vocab.vectors_length + if length == 0: + raise ValueError(Errors.E010) + return self.vocab.get_vector(self.c.orth) + + @vector.setter + def vector(self, vector): + if len(vector) != self.vocab.vectors_length: + raise ValueError(Errors.E073.format(new_length=len(vector), + length=self.vocab.vectors_length)) + self.vocab.set_vector(self.c.orth, vector) + + @property + def rank(self): """RETURNS (str): Sequential ID of the lexeme's lexical type, used to index into tables, e.g. 
for word vectors.""" - def __get__(self): - return self.c.id + return self.c.id - def __set__(self, value): - self.c.id = value + @rank.setter + def rank(self, value): + self.c.id = value - property sentiment: + @property + def sentiment(self): """RETURNS (float): A scalar value indicating the positivity or negativity of the lexeme.""" - def __get__(self): - sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment", {}) - return sentiment_table.get(self.c.orth, 0.0) + sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment", {}) + return sentiment_table.get(self.c.orth, 0.0) - def __set__(self, float x): - if "lexeme_sentiment" not in self.vocab.lookups: - self.vocab.lookups.add_table("lexeme_sentiment") - sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment") - sentiment_table[self.c.orth] = x + @sentiment.setter + def sentiment(self, float x): + if "lexeme_sentiment" not in self.vocab.lookups: + self.vocab.lookups.add_table("lexeme_sentiment") + sentiment_table = self.vocab.lookups.get_table("lexeme_sentiment") + sentiment_table[self.c.orth] = x @property def orth_(self): @@ -216,306 +219,338 @@ cdef class Lexeme: """RETURNS (str): The original verbatim text of the lexeme.""" return self.orth_ - property lower: + @property + def lower(self): """RETURNS (uint64): Lowercase form of the lexeme.""" - def __get__(self): - return self.c.lower + return self.c.lower - def __set__(self, attr_t x): - self.c.lower = x + @lower.setter + def lower(self, attr_t x): + self.c.lower = x - property norm: + @property + def norm(self): """RETURNS (uint64): The lexeme's norm, i.e. a normalised form of the lexeme text. 
""" - def __get__(self): - return self.c.norm + return self.c.norm - def __set__(self, attr_t x): - if "lexeme_norm" not in self.vocab.lookups: - self.vocab.lookups.add_table("lexeme_norm") - norm_table = self.vocab.lookups.get_table("lexeme_norm") - norm_table[self.c.orth] = self.vocab.strings[x] - self.c.norm = x + @norm.setter + def norm(self, attr_t x): + if "lexeme_norm" not in self.vocab.lookups: + self.vocab.lookups.add_table("lexeme_norm") + norm_table = self.vocab.lookups.get_table("lexeme_norm") + norm_table[self.c.orth] = self.vocab.strings[x] + self.c.norm = x - property shape: + @property + def shape(self): """RETURNS (uint64): Transform of the word's string, to show orthographic features. """ - def __get__(self): - return self.c.shape + return self.c.shape - def __set__(self, attr_t x): - self.c.shape = x + @shape.setter + def shape(self, attr_t x): + self.c.shape = x - property prefix: + @property + def prefix(self): """RETURNS (uint64): Length-N substring from the start of the word. Defaults to `N=1`. """ - def __get__(self): - return self.c.prefix + return self.c.prefix - def __set__(self, attr_t x): - self.c.prefix = x + @prefix.setter + def prefix(self, attr_t x): + self.c.prefix = x - property suffix: + @property + def suffix(self): """RETURNS (uint64): Length-N substring from the end of the word. Defaults to `N=3`. 
""" - def __get__(self): - return self.c.suffix + return self.c.suffix - def __set__(self, attr_t x): - self.c.suffix = x + @suffix.setter + def suffix(self, attr_t x): + self.c.suffix = x - property cluster: + @property + def cluster(self): """RETURNS (int): Brown cluster ID.""" - def __get__(self): - cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {}) - return cluster_table.get(self.c.orth, 0) + cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {}) + return cluster_table.get(self.c.orth, 0) - def __set__(self, int x): - cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {}) - cluster_table[self.c.orth] = x + @cluster.setter + def cluster(self, int x): + cluster_table = self.vocab.lookups.get_table("lexeme_cluster", {}) + cluster_table[self.c.orth] = x - property lang: + @property + def lang(self): """RETURNS (uint64): Language of the parent vocabulary.""" - def __get__(self): - return self.c.lang + return self.c.lang - def __set__(self, attr_t x): - self.c.lang = x + @lang.setter + def lang(self, attr_t x): + self.c.lang = x - property prob: + @property + def prob(self): """RETURNS (float): Smoothed log probability estimate of the lexeme's type.""" - def __get__(self): - prob_table = self.vocab.lookups.get_table("lexeme_prob", {}) - settings_table = self.vocab.lookups.get_table("lexeme_settings", {}) - default_oov_prob = settings_table.get("oov_prob", -20.0) - return prob_table.get(self.c.orth, default_oov_prob) + prob_table = self.vocab.lookups.get_table("lexeme_prob", {}) + settings_table = self.vocab.lookups.get_table("lexeme_settings", {}) + default_oov_prob = settings_table.get("oov_prob", -20.0) + return prob_table.get(self.c.orth, default_oov_prob) - def __set__(self, float x): - prob_table = self.vocab.lookups.get_table("lexeme_prob", {}) - prob_table[self.c.orth] = x + @prob.setter + def prob(self, float x): + prob_table = self.vocab.lookups.get_table("lexeme_prob", {}) + prob_table[self.c.orth] = x - property 
lower_: + @property + def lower_(self): """RETURNS (str): Lowercase form of the word.""" - def __get__(self): - return self.vocab.strings[self.c.lower] + return self.vocab.strings[self.c.lower] - def __set__(self, str x): - self.c.lower = self.vocab.strings.add(x) + @lower_.setter + def lower_(self, str x): + self.c.lower = self.vocab.strings.add(x) - property norm_: + @property + def norm_(self): """RETURNS (str): The lexeme's norm, i.e. a normalised form of the lexeme text. """ - def __get__(self): - return self.vocab.strings[self.c.norm] + return self.vocab.strings[self.c.norm] - def __set__(self, str x): - self.norm = self.vocab.strings.add(x) + @norm_.setter + def norm_(self, str x): + self.norm = self.vocab.strings.add(x) - property shape_: + @property + def shape_(self): """RETURNS (str): Transform of the word's string, to show orthographic features. """ - def __get__(self): - return self.vocab.strings[self.c.shape] + return self.vocab.strings[self.c.shape] - def __set__(self, str x): - self.c.shape = self.vocab.strings.add(x) + @shape_.setter + def shape_(self, str x): + self.c.shape = self.vocab.strings.add(x) - property prefix_: + @property + def prefix_(self): """RETURNS (str): Length-N substring from the start of the word. Defaults to `N=1`. """ - def __get__(self): - return self.vocab.strings[self.c.prefix] + return self.vocab.strings[self.c.prefix] - def __set__(self, str x): - self.c.prefix = self.vocab.strings.add(x) + @prefix_.setter + def prefix_(self, str x): + self.c.prefix = self.vocab.strings.add(x) - property suffix_: + @property + def suffix_(self): """RETURNS (str): Length-N substring from the end of the word. Defaults to `N=3`. 
""" - def __get__(self): - return self.vocab.strings[self.c.suffix] + return self.vocab.strings[self.c.suffix] - def __set__(self, str x): - self.c.suffix = self.vocab.strings.add(x) + @suffix_.setter + def suffix_(self, str x): + self.c.suffix = self.vocab.strings.add(x) - property lang_: + @property + def lang_(self): """RETURNS (str): Language of the parent vocabulary.""" - def __get__(self): - return self.vocab.strings[self.c.lang] + return self.vocab.strings[self.c.lang] - def __set__(self, str x): - self.c.lang = self.vocab.strings.add(x) + @lang_.setter + def lang_(self, str x): + self.c.lang = self.vocab.strings.add(x) - property flags: + @property + def flags(self): """RETURNS (uint64): Container of the lexeme's binary flags.""" - def __get__(self): - return self.c.flags + return self.c.flags - def __set__(self, flags_t x): - self.c.flags = x + @flags.setter + def flags(self, flags_t x): + self.c.flags = x @property def is_oov(self): """RETURNS (bool): Whether the lexeme is out-of-vocabulary.""" return self.orth not in self.vocab.vectors - property is_stop: + @property + def is_stop(self): """RETURNS (bool): Whether the lexeme is a stop word.""" - def __get__(self): - return Lexeme.c_check_flag(self.c, IS_STOP) + return Lexeme.c_check_flag(self.c, IS_STOP) - def __set__(self, bint x): - Lexeme.c_set_flag(self.c, IS_STOP, x) + @is_stop.setter + def is_stop(self, bint x): + Lexeme.c_set_flag(self.c, IS_STOP, x) - property is_alpha: + @property + def is_alpha(self): """RETURNS (bool): Whether the lexeme consists of alphabetic characters. Equivalent to `lexeme.text.isalpha()`. 
""" - def __get__(self): - return Lexeme.c_check_flag(self.c, IS_ALPHA) + return Lexeme.c_check_flag(self.c, IS_ALPHA) - def __set__(self, bint x): - Lexeme.c_set_flag(self.c, IS_ALPHA, x) + @is_alpha.setter + def is_alpha(self, bint x): + Lexeme.c_set_flag(self.c, IS_ALPHA, x) - property is_ascii: + @property + def is_ascii(self): """RETURNS (bool): Whether the lexeme consists of ASCII characters. Equivalent to `[any(ord(c) >= 128 for c in lexeme.text)]`. """ - def __get__(self): - return Lexeme.c_check_flag(self.c, IS_ASCII) + return Lexeme.c_check_flag(self.c, IS_ASCII) - def __set__(self, bint x): - Lexeme.c_set_flag(self.c, IS_ASCII, x) + @is_ascii.setter + def is_ascii(self, bint x): + Lexeme.c_set_flag(self.c, IS_ASCII, x) - property is_digit: + @property + def is_digit(self): """RETURNS (bool): Whether the lexeme consists of digits. Equivalent to `lexeme.text.isdigit()`. """ - def __get__(self): - return Lexeme.c_check_flag(self.c, IS_DIGIT) + return Lexeme.c_check_flag(self.c, IS_DIGIT) - def __set__(self, bint x): - Lexeme.c_set_flag(self.c, IS_DIGIT, x) + @is_digit.setter + def is_digit(self, bint x): + Lexeme.c_set_flag(self.c, IS_DIGIT, x) - property is_lower: + @property + def is_lower(self): """RETURNS (bool): Whether the lexeme is in lowercase. Equivalent to `lexeme.text.islower()`. """ - def __get__(self): - return Lexeme.c_check_flag(self.c, IS_LOWER) + return Lexeme.c_check_flag(self.c, IS_LOWER) - def __set__(self, bint x): - Lexeme.c_set_flag(self.c, IS_LOWER, x) + @is_lower.setter + def is_lower(self, bint x): + Lexeme.c_set_flag(self.c, IS_LOWER, x) - property is_upper: + @property + def is_upper(self): """RETURNS (bool): Whether the lexeme is in uppercase. Equivalent to `lexeme.text.isupper()`. 
""" - def __get__(self): - return Lexeme.c_check_flag(self.c, IS_UPPER) + return Lexeme.c_check_flag(self.c, IS_UPPER) - def __set__(self, bint x): - Lexeme.c_set_flag(self.c, IS_UPPER, x) + @is_upper.setter + def is_upper(self, bint x): + Lexeme.c_set_flag(self.c, IS_UPPER, x) - property is_title: + @property + def is_title(self): """RETURNS (bool): Whether the lexeme is in titlecase. Equivalent to `lexeme.text.istitle()`. """ - def __get__(self): - return Lexeme.c_check_flag(self.c, IS_TITLE) + return Lexeme.c_check_flag(self.c, IS_TITLE) - def __set__(self, bint x): - Lexeme.c_set_flag(self.c, IS_TITLE, x) + @is_title.setter + def is_title(self, bint x): + Lexeme.c_set_flag(self.c, IS_TITLE, x) - property is_punct: + @property + def is_punct(self): """RETURNS (bool): Whether the lexeme is punctuation.""" - def __get__(self): - return Lexeme.c_check_flag(self.c, IS_PUNCT) + return Lexeme.c_check_flag(self.c, IS_PUNCT) - def __set__(self, bint x): - Lexeme.c_set_flag(self.c, IS_PUNCT, x) + @is_punct.setter + def is_punct(self, bint x): + Lexeme.c_set_flag(self.c, IS_PUNCT, x) - property is_space: + @property + def is_space(self): """RETURNS (bool): Whether the lexeme consist of whitespace characters. Equivalent to `lexeme.text.isspace()`. 
""" - def __get__(self): - return Lexeme.c_check_flag(self.c, IS_SPACE) + return Lexeme.c_check_flag(self.c, IS_SPACE) - def __set__(self, bint x): - Lexeme.c_set_flag(self.c, IS_SPACE, x) + @is_space.setter + def is_space(self, bint x): + Lexeme.c_set_flag(self.c, IS_SPACE, x) - property is_bracket: + @property + def is_bracket(self): """RETURNS (bool): Whether the lexeme is a bracket.""" - def __get__(self): - return Lexeme.c_check_flag(self.c, IS_BRACKET) + return Lexeme.c_check_flag(self.c, IS_BRACKET) - def __set__(self, bint x): - Lexeme.c_set_flag(self.c, IS_BRACKET, x) + @is_bracket.setter + def is_bracket(self, bint x): + Lexeme.c_set_flag(self.c, IS_BRACKET, x) - property is_quote: + @property + def is_quote(self): """RETURNS (bool): Whether the lexeme is a quotation mark.""" - def __get__(self): - return Lexeme.c_check_flag(self.c, IS_QUOTE) + return Lexeme.c_check_flag(self.c, IS_QUOTE) - def __set__(self, bint x): - Lexeme.c_set_flag(self.c, IS_QUOTE, x) + @is_quote.setter + def is_quote(self, bint x): + Lexeme.c_set_flag(self.c, IS_QUOTE, x) - property is_left_punct: + @property + def is_left_punct(self): """RETURNS (bool): Whether the lexeme is left punctuation, e.g. (.""" - def __get__(self): - return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT) + return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT) - def __set__(self, bint x): - Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, x) + @is_left_punct.setter + def is_left_punct(self, bint x): + Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, x) - property is_right_punct: + @property + def is_right_punct(self): """RETURNS (bool): Whether the lexeme is right punctuation, e.g. 
).""" - def __get__(self): - return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT) + return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT) - def __set__(self, bint x): - Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x) + @is_right_punct.setter + def is_right_punct(self, bint x): + Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x) - property is_currency: + @property + def is_currency(self): """RETURNS (bool): Whether the lexeme is a currency symbol, e.g. $, €.""" - def __get__(self): - return Lexeme.c_check_flag(self.c, IS_CURRENCY) + return Lexeme.c_check_flag(self.c, IS_CURRENCY) - def __set__(self, bint x): - Lexeme.c_set_flag(self.c, IS_CURRENCY, x) + @is_currency.setter + def is_currency(self, bint x): + Lexeme.c_set_flag(self.c, IS_CURRENCY, x) - property like_url: + @property + def like_url(self): """RETURNS (bool): Whether the lexeme resembles a URL.""" - def __get__(self): - return Lexeme.c_check_flag(self.c, LIKE_URL) + return Lexeme.c_check_flag(self.c, LIKE_URL) - def __set__(self, bint x): - Lexeme.c_set_flag(self.c, LIKE_URL, x) + @like_url.setter + def like_url(self, bint x): + Lexeme.c_set_flag(self.c, LIKE_URL, x) - property like_num: + @property + def like_num(self): """RETURNS (bool): Whether the lexeme represents a number, e.g. "10.9", "10", "ten", etc. 
""" - def __get__(self): - return Lexeme.c_check_flag(self.c, LIKE_NUM) + return Lexeme.c_check_flag(self.c, LIKE_NUM) - def __set__(self, bint x): - Lexeme.c_set_flag(self.c, LIKE_NUM, x) + @like_num.setter + def like_num(self, bint x): + Lexeme.c_set_flag(self.c, LIKE_NUM, x) - property like_email: + @property + def like_email(self): """RETURNS (bool): Whether the lexeme resembles an email address.""" - def __get__(self): - return Lexeme.c_check_flag(self.c, LIKE_EMAIL) + return Lexeme.c_check_flag(self.c, LIKE_EMAIL) - def __set__(self, bint x): - Lexeme.c_set_flag(self.c, LIKE_EMAIL, x) + @like_email.setter + def like_email(self, bint x): + Lexeme.c_set_flag(self.c, LIKE_EMAIL, x) diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 6f2b10734c5..96545828fde 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -70,65 +70,72 @@ cdef class Tokenizer: self._special_matcher = PhraseMatcher(self.vocab) self._load_special_cases(rules) - property token_match: - def __get__(self): - return self._token_match - - def __set__(self, token_match): - self._token_match = token_match - self._reload_special_cases() - - property url_match: - def __get__(self): - return self._url_match - - def __set__(self, url_match): - self._url_match = url_match - self._reload_special_cases() - - property prefix_search: - def __get__(self): - return self._prefix_search - - def __set__(self, prefix_search): - self._prefix_search = prefix_search - self._reload_special_cases() - - property suffix_search: - def __get__(self): - return self._suffix_search - - def __set__(self, suffix_search): - self._suffix_search = suffix_search - self._reload_special_cases() - - property infix_finditer: - def __get__(self): - return self._infix_finditer - - def __set__(self, infix_finditer): - self._infix_finditer = infix_finditer - self._reload_special_cases() - - property rules: - def __get__(self): - return self._rules - - def __set__(self, rules): - self._rules = {} - self._flush_cache() - 
self._flush_specials() - self._cache = PreshMap() - self._specials = PreshMap() - self._load_special_cases(rules) - - property faster_heuristics: - def __get__(self): - return bool(self._faster_heuristics) - - def __set__(self, faster_heuristics): - self._faster_heuristics = bool(faster_heuristics) - self._reload_special_cases() + @property + def token_match(self): + return self._token_match + + @token_match.setter + def token_match(self, token_match): + self._token_match = token_match + self._reload_special_cases() + + @property + def url_match(self): + return self._url_match + + @url_match.setter + def url_match(self, url_match): + self._url_match = url_match + self._reload_special_cases() + + @property + def prefix_search(self): + return self._prefix_search + + @prefix_search.setter + def prefix_search(self, prefix_search): + self._prefix_search = prefix_search + self._reload_special_cases() + + @property + def suffix_search(self): + return self._suffix_search + + @suffix_search.setter + def suffix_search(self, suffix_search): + self._suffix_search = suffix_search + self._reload_special_cases() + + @property + def infix_finditer(self): + return self._infix_finditer + + @infix_finditer.setter + def infix_finditer(self, infix_finditer): + self._infix_finditer = infix_finditer + self._reload_special_cases() + + @property + def rules(self): + return self._rules + + @rules.setter + def rules(self, rules): + self._rules = {} + self._flush_cache() + self._flush_specials() + self._cache = PreshMap() + self._specials = PreshMap() + self._load_special_cases(rules) + + @property + def faster_heuristics(self): + return bool(self._faster_heuristics) + + @faster_heuristics.setter + def faster_heuristics(self, faster_heuristics): + self._faster_heuristics = bool(faster_heuristics) + self._reload_special_cases() def __reduce__(self): args = (self.vocab, diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 181c0ce0fce..4d624956968 100644 --- a/spacy/tokens/doc.pyx +++ 
b/spacy/tokens/doc.pyx @@ -667,7 +667,8 @@ cdef class Doc: else: return False - property vector: + @property + def vector(self): """A real-valued meaning representation. Defaults to an average of the token vectors. @@ -676,48 +677,49 @@ cdef class Doc: DOCS: https://spacy.io/api/doc#vector """ - def __get__(self): - if "vector" in self.user_hooks: - return self.user_hooks["vector"](self) - if self._vector is not None: - return self._vector - xp = get_array_module(self.vocab.vectors.data) - if not len(self): - self._vector = xp.zeros((self.vocab.vectors_length,), dtype="f") - return self._vector - elif self.vocab.vectors.size > 0: - self._vector = sum(t.vector for t in self) / len(self) - return self._vector - elif self.tensor.size > 0: - self._vector = self.tensor.mean(axis=0) - return self._vector - else: - return xp.zeros((self.vocab.vectors_length,), dtype="float32") + if "vector" in self.user_hooks: + return self.user_hooks["vector"](self) + if self._vector is not None: + return self._vector + xp = get_array_module(self.vocab.vectors.data) + if not len(self): + self._vector = xp.zeros((self.vocab.vectors_length,), dtype="f") + return self._vector + elif self.vocab.vectors.size > 0: + self._vector = sum(t.vector for t in self) / len(self) + return self._vector + elif self.tensor.size > 0: + self._vector = self.tensor.mean(axis=0) + return self._vector + else: + return xp.zeros((self.vocab.vectors_length,), dtype="float32") - def __set__(self, value): - self._vector = value + @vector.setter + def vector(self, value): + self._vector = value - property vector_norm: + @property + def vector_norm(self): """The L2 norm of the document's vector representation. RETURNS (float): The L2 norm of the vector representation. 
DOCS: https://spacy.io/api/doc#vector_norm """ - def __get__(self): - if "vector_norm" in self.user_hooks: - return self.user_hooks["vector_norm"](self) - cdef float value - cdef double norm = 0 - if self._vector_norm is None: - norm = 0.0 - for value in self.vector: - norm += value * value - self._vector_norm = sqrt(norm) if norm != 0 else 0 - return self._vector_norm - - def __set__(self, value): - self._vector_norm = value + if "vector_norm" in self.user_hooks: + return self.user_hooks["vector_norm"](self) + cdef float value + cdef double norm = 0 + if self._vector_norm is None: + norm = 0.0 + for value in self.vector: + norm += value * value + self._vector_norm = sqrt(norm) if norm != 0 else 0 + return self._vector_norm + + @vector_norm.setter + def vector_norm(self, value): + self._vector_norm = value @property def text(self): @@ -736,7 +738,8 @@ cdef class Doc: """ return self.text - property ents: + @property + def ents(self): """The named entities in the document. Returns a tuple of named entity `Span` objects, if the entity recognizer has been applied. 
@@ -744,55 +747,55 @@ cdef class Doc: DOCS: https://spacy.io/api/doc#ents """ - def __get__(self): - cdef int i - cdef const TokenC* token - cdef int start = -1 - cdef attr_t label = 0 - cdef attr_t kb_id = 0 - cdef attr_t ent_id = 0 - output = [] - for i in range(self.length): - token = &self.c[i] - if token.ent_iob == 1: - if start == -1: - seq = [f"{t.text}|{t.ent_iob_}" for t in self[i-5:i+5]] - raise ValueError(Errors.E093.format(seq=" ".join(seq))) - elif token.ent_iob == 2 or token.ent_iob == 0 or \ - (token.ent_iob == 3 and token.ent_type == 0): - if start != -1: - output.append(Span(self, start, i, label=label, kb_id=kb_id, span_id=ent_id)) - start = -1 - label = 0 - kb_id = 0 - ent_id = 0 - elif token.ent_iob == 3: - if start != -1: - output.append(Span(self, start, i, label=label, kb_id=kb_id, span_id=ent_id)) - start = i - label = token.ent_type - kb_id = token.ent_kb_id - ent_id = token.ent_id - if start != -1: - output.append(Span(self, start, self.length, label=label, kb_id=kb_id, span_id=ent_id)) - # remove empty-label spans - output = [o for o in output if o.label_ != ""] - return tuple(output) - - def __set__(self, ents): - # TODO: - # 1. Test basic data-driven ORTH gazetteer - # 2. 
Test more nuanced date and currency regex - cdef attr_t kb_id, ent_id - cdef int ent_start, ent_end - ent_spans = [] - for ent_info in ents: - entity_type_, kb_id, ent_start, ent_end, ent_id = get_entity_info(ent_info) - if isinstance(entity_type_, str): - self.vocab.strings.add(entity_type_) - span = Span(self, ent_start, ent_end, label=entity_type_, kb_id=kb_id, span_id=ent_id) - ent_spans.append(span) - self.set_ents(ent_spans, default=SetEntsDefault.outside) + cdef int i + cdef const TokenC* token + cdef int start = -1 + cdef attr_t label = 0 + cdef attr_t kb_id = 0 + cdef attr_t ent_id = 0 + output = [] + for i in range(self.length): + token = &self.c[i] + if token.ent_iob == 1: + if start == -1: + seq = [f"{t.text}|{t.ent_iob_}" for t in self[i-5:i+5]] + raise ValueError(Errors.E093.format(seq=" ".join(seq))) + elif token.ent_iob == 2 or token.ent_iob == 0 or \ + (token.ent_iob == 3 and token.ent_type == 0): + if start != -1: + output.append(Span(self, start, i, label=label, kb_id=kb_id, span_id=ent_id)) + start = -1 + label = 0 + kb_id = 0 + ent_id = 0 + elif token.ent_iob == 3: + if start != -1: + output.append(Span(self, start, i, label=label, kb_id=kb_id, span_id=ent_id)) + start = i + label = token.ent_type + kb_id = token.ent_kb_id + ent_id = token.ent_id + if start != -1: + output.append(Span(self, start, self.length, label=label, kb_id=kb_id, span_id=ent_id)) + # remove empty-label spans + output = [o for o in output if o.label_ != ""] + return tuple(output) + + @ents.setter + def ents(self, ents): + # TODO: + # 1. Test basic data-driven ORTH gazetteer + # 2. 
Test more nuanced date and currency regex + cdef attr_t kb_id, ent_id + cdef int ent_start, ent_end + ent_spans = [] + for ent_info in ents: + entity_type_, kb_id, ent_start, ent_end, ent_id = get_entity_info(ent_info) + if isinstance(entity_type_, str): + self.vocab.strings.add(entity_type_) + span = Span(self, ent_start, ent_end, label=entity_type_, kb_id=kb_id, span_id=ent_id) + ent_spans.append(span) + self.set_ents(ent_spans, default=SetEntsDefault.outside) def set_ents(self, entities, *, blocked=None, missing=None, outside=None, default=SetEntsDefault.outside): """Set entity annotation. diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index e179bbce7eb..64b8d7c6c1d 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -757,78 +757,87 @@ cdef class Span: for word in self.rights: yield from word.subtree - property start: - def __get__(self): - return self.c.start + @property + def start(self): + return self.c.start - def __set__(self, int start): - if start < 0: - raise IndexError(Errors.E1032.format(var="start", forbidden="< 0", value=start)) - self.c.start = start + @start.setter + def start(self, int start): + if start < 0: + raise IndexError(Errors.E1032.format(var="start", forbidden="< 0", value=start)) + self.c.start = start - property end: - def __get__(self): - return self.c.end + @property + def end(self): + return self.c.end - def __set__(self, int end): - if end < 0: - raise IndexError(Errors.E1032.format(var="end", forbidden="< 0", value=end)) - self.c.end = end + @end.setter + def end(self, int end): + if end < 0: + raise IndexError(Errors.E1032.format(var="end", forbidden="< 0", value=end)) + self.c.end = end - property start_char: - def __get__(self): - return self.c.start_char + @property + def start_char(self): + return self.c.start_char - def __set__(self, int start_char): - if start_char < 0: - raise IndexError(Errors.E1032.format(var="start_char", forbidden="< 0", value=start_char)) - self.c.start_char = start_char + 
@start_char.setter + def start_char(self, int start_char): + if start_char < 0: + raise IndexError(Errors.E1032.format(var="start_char", forbidden="< 0", value=start_char)) + self.c.start_char = start_char - property end_char: - def __get__(self): - return self.c.end_char + @property + def end_char(self): + return self.c.end_char - def __set__(self, int end_char): - if end_char < 0: - raise IndexError(Errors.E1032.format(var="end_char", forbidden="< 0", value=end_char)) - self.c.end_char = end_char + @end_char.setter + def end_char(self, int end_char): + if end_char < 0: + raise IndexError(Errors.E1032.format(var="end_char", forbidden="< 0", value=end_char)) + self.c.end_char = end_char - property label: - def __get__(self): - return self.c.label + @property + def label(self): + return self.c.label - def __set__(self, attr_t label): - self.c.label = label + @label.setter + def label(self, attr_t label): + self.c.label = label - property kb_id: - def __get__(self): - return self.c.kb_id + @property + def kb_id(self): + return self.c.kb_id - def __set__(self, attr_t kb_id): - self.c.kb_id = kb_id + @kb_id.setter + def kb_id(self, attr_t kb_id): + self.c.kb_id = kb_id - property id: - def __get__(self): - return self.c.id + @property + def id(self): + return self.c.id - def __set__(self, attr_t id): - self.c.id = id + @id.setter + def id(self, attr_t id): + self.c.id = id - property ent_id: + @property + def ent_id(self): """RETURNS (uint64): The entity ID.""" - def __get__(self): - return self.root.ent_id + return self.root.ent_id - def __set__(self, hash_t key): - raise NotImplementedError(Errors.E200.format(attr="ent_id")) + @ent_id.setter + def ent_id(self, hash_t key): + raise NotImplementedError(Errors.E200.format(attr="ent_id")) - property ent_id_: + @property + def ent_id_(self): """RETURNS (str): The (string) entity ID.""" - def __get__(self): - return self.root.ent_id_ + return self.root.ent_id_ - def __set__(self, str key): - raise 
NotImplementedError(Errors.E200.format(attr="ent_id_")) + @ent_id_.setter + def ent_id_(self, str key): + raise NotImplementedError(Errors.E200.format(attr="ent_id_")) @property def orth_(self): @@ -843,29 +852,32 @@ cdef class Span: """RETURNS (str): The span's lemma.""" return "".join([t.lemma_ + t.whitespace_ for t in self]).strip() - property label_: + @property + def label_(self): """RETURNS (str): The span's label.""" - def __get__(self): - return self.doc.vocab.strings[self.label] + return self.doc.vocab.strings[self.label] - def __set__(self, str label_): - self.label = self.doc.vocab.strings.add(label_) + @label_.setter + def label_(self, str label_): + self.label = self.doc.vocab.strings.add(label_) - property kb_id_: + @property + def kb_id_(self): """RETURNS (str): The span's KB ID.""" - def __get__(self): - return self.doc.vocab.strings[self.kb_id] + return self.doc.vocab.strings[self.kb_id] - def __set__(self, str kb_id_): - self.kb_id = self.doc.vocab.strings.add(kb_id_) + @kb_id_.setter + def kb_id_(self, str kb_id_): + self.kb_id = self.doc.vocab.strings.add(kb_id_) - property id_: + @property + def id_(self): """RETURNS (str): The span's ID.""" - def __get__(self): - return self.doc.vocab.strings[self.id] + return self.doc.vocab.strings[self.id] - def __set__(self, str id_): - self.id = self.doc.vocab.strings.add(id_) + @id_.setter + def id_(self, str id_): + self.id = self.doc.vocab.strings.add(id_) cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1: diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 2ed736b7035..a3efd5886ee 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -249,15 +249,16 @@ cdef class Token: """ return not self.c.morph == 0 - property morph: - def __get__(self): - return MorphAnalysis.from_id(self.vocab, self.c.morph) + @property + def morph(self): + return MorphAnalysis.from_id(self.vocab, self.c.morph) - def __set__(self, MorphAnalysis morph): - # Check that the 
morph has the same vocab - if self.vocab != morph.vocab: - raise ValueError(Errors.E1013) - self.c.morph = morph.c.key + @morph.setter + def morph(self, MorphAnalysis morph): + # Check that the morph has the same vocab + if self.vocab != morph.vocab: + raise ValueError(Errors.E1013) + self.c.morph = morph.c.key def set_morph(self, features): cdef hash_t key @@ -377,39 +378,43 @@ cdef class Token: """ return self.c.lex.suffix - property lemma: + @property + def lemma(self): """RETURNS (uint64): ID of the base form of the word, with no inflectional suffixes. """ - def __get__(self): - return self.c.lemma + return self.c.lemma - def __set__(self, attr_t lemma): - self.c.lemma = lemma + @lemma.setter + def lemma(self, attr_t lemma): + self.c.lemma = lemma - property pos: + @property + def pos(self): """RETURNS (uint64): ID of coarse-grained part-of-speech tag.""" - def __get__(self): - return self.c.pos + return self.c.pos - def __set__(self, pos): - self.c.pos = pos + @pos.setter + def pos(self, pos): + self.c.pos = pos - property tag: + @property + def tag(self): """RETURNS (uint64): ID of fine-grained part-of-speech tag.""" - def __get__(self): - return self.c.tag + return self.c.tag - def __set__(self, attr_t tag): - self.c.tag = tag + @tag.setter + def tag(self, attr_t tag): + self.c.tag = tag - property dep: + @property + def dep(self): """RETURNS (uint64): ID of syntactic dependency label.""" - def __get__(self): - return self.c.dep + return self.c.dep - def __set__(self, attr_t label): - self.c.dep = label + @dep.setter + def dep(self, attr_t label): + self.c.dep = label @property def has_vector(self): @@ -494,48 +499,51 @@ cdef class Token: return self.doc.user_token_hooks["sent"](self) return self.doc[self.i : self.i+1].sent - property sent_start: - def __get__(self): - """Deprecated: use Token.is_sent_start instead.""" - # Raising a deprecation warning here causes errors for autocomplete - # Handle broken backwards compatibility case: doc[0].sent_start - # 
was False. - if self.i == 0: - return False - else: - return self.c.sent_start + @property + def sent_start(self): + """Deprecated: use Token.is_sent_start instead.""" + # Raising a deprecation warning here causes errors for autocomplete + # Handle broken backwards compatibility case: doc[0].sent_start + # was False. + if self.i == 0: + return False + else: + return self.c.sent_start - def __set__(self, value): - self.is_sent_start = value + @sent_start.setter + def sent_start(self, value): + self.is_sent_start = value - property is_sent_start: + @property + def is_sent_start(self): """A boolean value indicating whether the token starts a sentence. `None` if unknown. Defaults to `True` for the first token in the `Doc`. RETURNS (bool / None): Whether the token starts a sentence. None if unknown. """ - def __get__(self): - if self.c.sent_start == 0: - return None - elif self.c.sent_start < 0: - return False - else: - return True + if self.c.sent_start == 0: + return None + elif self.c.sent_start < 0: + return False + else: + return True - def __set__(self, value): - if self.doc.has_annotation("DEP"): - raise ValueError(Errors.E043) - if value is None: - self.c.sent_start = 0 - elif value is True: - self.c.sent_start = 1 - elif value is False: - self.c.sent_start = -1 - else: - raise ValueError(Errors.E044.format(value=value)) + @is_sent_start.setter + def is_sent_start(self, value): + if self.doc.has_annotation("DEP"): + raise ValueError(Errors.E043) + if value is None: + self.c.sent_start = 0 + elif value is True: + self.c.sent_start = 1 + elif value is False: + self.c.sent_start = -1 + else: + raise ValueError(Errors.E044.format(value=value)) - property is_sent_end: + @property + def is_sent_end(self): """A boolean value indicating whether the token ends a sentence. `None` if unknown. Defaults to `True` for the last token in the `Doc`. 
@@ -544,18 +552,18 @@ cdef class Token: DOCS: https://spacy.io/api/token#is_sent_end """ - def __get__(self): - if self.i + 1 == len(self.doc): - return True - elif self.doc[self.i+1].is_sent_start is None: - return None - elif self.doc[self.i+1].is_sent_start is True: - return True - else: - return False + if self.i + 1 == len(self.doc): + return True + elif self.doc[self.i+1].is_sent_start is None: + return None + elif self.doc[self.i+1].is_sent_start is True: + return True + else: + return False - def __set__(self, value): - raise ValueError(Errors.E196) + @is_sent_end.setter + def is_sent_end(self, value): + raise ValueError(Errors.E196) @property def lefts(self): @@ -682,41 +690,42 @@ cdef class Token: """ return not Token.missing_head(self.c) - property head: + @property + def head(self): """The syntactic parent, or "governor", of this token. If token.has_head() is `False`, this method will return itself. RETURNS (Token): The token predicted by the parser to be the head of the current token. 
""" - def __get__(self): - if not self.has_head(): - return self - else: - return self.doc[self.i + self.c.head] - - def __set__(self, Token new_head): - # This function sets the head of self to new_head and updates the - # counters for left/right dependents and left/right corner for the - # new and the old head - # Check that token is from the same document - if self.doc != new_head.doc: - raise ValueError(Errors.E191) - # Do nothing if old head is new head - if self.i + self.c.head == new_head.i: - return - # Find the widest l/r_edges of the roots of the two tokens involved - # to limit the number of tokens for set_children_from_heads - cdef Token self_root, new_head_root - self_root = ([self] + list(self.ancestors))[-1] - new_head_ancestors = list(new_head.ancestors) - new_head_root = new_head_ancestors[-1] if new_head_ancestors else new_head - start = self_root.c.l_edge if self_root.c.l_edge < new_head_root.c.l_edge else new_head_root.c.l_edge - end = self_root.c.r_edge if self_root.c.r_edge > new_head_root.c.r_edge else new_head_root.c.r_edge - # Set new head - self.c.head = new_head.i - self.i - # Adjust parse properties and sentence starts - set_children_from_heads(self.doc.c, start, end + 1) + if not self.has_head(): + return self + else: + return self.doc[self.i + self.c.head] + + @head.setter + def head(self, Token new_head): + # This function sets the head of self to new_head and updates the + # counters for left/right dependents and left/right corner for the + # new and the old head + # Check that token is from the same document + if self.doc != new_head.doc: + raise ValueError(Errors.E191) + # Do nothing if old head is new head + if self.i + self.c.head == new_head.i: + return + # Find the widest l/r_edges of the roots of the two tokens involved + # to limit the number of tokens for set_children_from_heads + cdef Token self_root, new_head_root + self_root = ([self] + list(self.ancestors))[-1] + new_head_ancestors = list(new_head.ancestors) + 
new_head_root = new_head_ancestors[-1] if new_head_ancestors else new_head + start = self_root.c.l_edge if self_root.c.l_edge < new_head_root.c.l_edge else new_head_root.c.l_edge + end = self_root.c.r_edge if self_root.c.r_edge > new_head_root.c.r_edge else new_head_root.c.r_edge + # Set new head + self.c.head = new_head.i - self.i + # Adjust parse properties and sentence starts + set_children_from_heads(self.doc.c, start, end + 1) @property def conjuncts(self): @@ -744,21 +753,23 @@ cdef class Token: queue.append(child) return tuple([w for w in output if w.i != self.i]) - property ent_type: + @property + def ent_type(self): """RETURNS (uint64): Named entity type.""" - def __get__(self): - return self.c.ent_type + return self.c.ent_type - def __set__(self, ent_type): - self.c.ent_type = ent_type + @ent_type.setter + def ent_type(self, ent_type): + self.c.ent_type = ent_type - property ent_type_: + @property + def ent_type_(self): """RETURNS (str): Named entity type.""" - def __get__(self): - return self.vocab.strings[self.c.ent_type] + return self.vocab.strings[self.c.ent_type] - def __set__(self, ent_type): - self.c.ent_type = self.vocab.strings.add(ent_type) + @ent_type_.setter + def ent_type_(self, ent_type): + self.c.ent_type = self.vocab.strings.add(ent_type) @property def ent_iob(self): @@ -784,41 +795,45 @@ cdef class Token: """ return self.iob_strings()[self.c.ent_iob] - property ent_id: + @property + def ent_id(self): """RETURNS (uint64): ID of the entity the token is an instance of, if any. """ - def __get__(self): - return self.c.ent_id + return self.c.ent_id - def __set__(self, hash_t key): - self.c.ent_id = key + @ent_id.setter + def ent_id(self, hash_t key): + self.c.ent_id = key - property ent_id_: + @property + def ent_id_(self): """RETURNS (str): ID of the entity the token is an instance of, if any. 
""" - def __get__(self): - return self.vocab.strings[self.c.ent_id] + return self.vocab.strings[self.c.ent_id] - def __set__(self, name): - self.c.ent_id = self.vocab.strings.add(name) + @ent_id_.setter + def ent_id_(self, name): + self.c.ent_id = self.vocab.strings.add(name) - property ent_kb_id: + @property + def ent_kb_id(self): """RETURNS (uint64): Named entity KB ID.""" - def __get__(self): - return self.c.ent_kb_id + return self.c.ent_kb_id - def __set__(self, attr_t ent_kb_id): - self.c.ent_kb_id = ent_kb_id + @ent_kb_id.setter + def ent_kb_id(self, attr_t ent_kb_id): + self.c.ent_kb_id = ent_kb_id - property ent_kb_id_: + @property + def ent_kb_id_(self): """RETURNS (str): Named entity KB ID.""" - def __get__(self): - return self.vocab.strings[self.c.ent_kb_id] + return self.vocab.strings[self.c.ent_kb_id] - def __set__(self, ent_kb_id): - self.c.ent_kb_id = self.vocab.strings.add(ent_kb_id) + @ent_kb_id_.setter + def ent_kb_id_(self, ent_kb_id): + self.c.ent_kb_id = self.vocab.strings.add(ent_kb_id) @property def whitespace_(self): @@ -840,16 +855,17 @@ cdef class Token: """ return self.vocab.strings[self.c.lex.lower] - property norm_: + @property + def norm_(self): """RETURNS (str): The token's norm, i.e. a normalised form of the token text. Usually set in the language's tokenizer exceptions or norm exceptions. """ - def __get__(self): - return self.vocab.strings[self.norm] + return self.vocab.strings[self.norm] - def __set__(self, str norm_): - self.c.norm = self.vocab.strings.add(norm_) + @norm_.setter + def norm_(self, str norm_): + self.c.norm = self.vocab.strings.add(norm_) @property def shape_(self): @@ -879,33 +895,36 @@ cdef class Token: """ return self.vocab.strings[self.c.lex.lang] - property lemma_: + @property + def lemma_(self): """RETURNS (str): The token lemma, i.e. the base form of the word, with no inflectional suffixes. 
""" - def __get__(self): - return self.vocab.strings[self.c.lemma] + return self.vocab.strings[self.c.lemma] - def __set__(self, str lemma_): - self.c.lemma = self.vocab.strings.add(lemma_) + @lemma_.setter + def lemma_(self, str lemma_): + self.c.lemma = self.vocab.strings.add(lemma_) - property pos_: + @property + def pos_(self): """RETURNS (str): Coarse-grained part-of-speech tag.""" - def __get__(self): - return parts_of_speech.NAMES[self.c.pos] + return parts_of_speech.NAMES[self.c.pos] - def __set__(self, pos_name): - if pos_name not in parts_of_speech.IDS: - raise ValueError(Errors.E1021.format(pp=pos_name)) - self.c.pos = parts_of_speech.IDS[pos_name] + @pos_.setter + def pos_(self, pos_name): + if pos_name not in parts_of_speech.IDS: + raise ValueError(Errors.E1021.format(pp=pos_name)) + self.c.pos = parts_of_speech.IDS[pos_name] - property tag_: + @property + def tag_(self): """RETURNS (str): Fine-grained part-of-speech tag.""" - def __get__(self): - return self.vocab.strings[self.c.tag] + return self.vocab.strings[self.c.tag] - def __set__(self, tag): - self.tag = self.vocab.strings.add(tag) + @tag_.setter + def tag_(self, tag): + self.tag = self.vocab.strings.add(tag) def has_dep(self): """Check whether the token has annotated dep information. 
@@ -915,13 +934,14 @@ cdef class Token: """ return not Token.missing_dep(self.c) - property dep_: + @property + def dep_(self): """RETURNS (str): The syntactic dependency label.""" - def __get__(self): - return self.vocab.strings[self.c.dep] + return self.vocab.strings[self.c.dep] - def __set__(self, str label): - self.c.dep = self.vocab.strings.add(label) + @dep_.setter + def dep_(self, str label): + self.c.dep = self.vocab.strings.add(label) @property def is_oov(self): diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx index abdcecf71d1..2c1ff34cf2f 100644 --- a/spacy/training/example.pyx +++ b/spacy/training/example.pyx @@ -88,23 +88,25 @@ cdef class Example: def __len__(self): return len(self.predicted) - property predicted: - def __get__(self): - return self.x + @property + def predicted(self): + return self.x - def __set__(self, doc): - self.x = doc - self._cached_alignment = None - self._cached_words_x = [t.text for t in doc] + @predicted.setter + def predicted(self, doc): + self.x = doc + self._cached_alignment = None + self._cached_words_x = [t.text for t in doc] - property reference: - def __get__(self): - return self.y + @property + def reference(self): + return self.y - def __set__(self, doc): - self.y = doc - self._cached_alignment = None - self._cached_words_y = [t.text for t in doc] + @reference.setter + def reference(self, doc): + self.y = doc + self._cached_alignment = None + self._cached_words_y = [t.text for t in doc] def copy(self): return Example( @@ -420,9 +422,9 @@ cdef class Example: seen_indices.update(indices) return output - property text: - def __get__(self): - return self.x.text + @property + def text(self): + return self.x.text def __str__(self): return str(self.to_dict()) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 4004a70e034..19e6eb005c0 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -88,16 +88,17 @@ cdef class Vocab: self.writing_system = writing_system self.get_noun_chunks = get_noun_chunks - 
property vectors: - def __get__(self): - return self._vectors + @property + def vectors(self): + return self._vectors - def __set__(self, vectors): - if hasattr(vectors, "strings"): - for s in vectors.strings: - self.strings.add(s) - self._vectors = vectors - self._vectors.strings = self.strings + @vectors.setter + def vectors(self, vectors): + if hasattr(vectors, "strings"): + for s in vectors.strings: + self.strings.add(s) + self._vectors = vectors + self._vectors.strings = self.strings @property def lang(self): @@ -464,17 +465,18 @@ cdef class Vocab: key = Lexeme.get_struct_attr(lex.c, self.vectors.attr) return key in self.vectors - property lookups: - def __get__(self): - return self._lookups - - def __set__(self, lookups): - self._lookups = lookups - if lookups.has_table("lexeme_norm"): - self.lex_attr_getters[NORM] = util.add_lookups( - self.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]), - self.lookups.get_table("lexeme_norm"), - ) + @property + def lookups(self): + return self._lookups + + @lookups.setter + def lookups(self, lookups): + self._lookups = lookups + if lookups.has_table("lexeme_norm"): + self.lex_attr_getters[NORM] = util.add_lookups( + self.lex_attr_getters.get(NORM, LEX_ATTRS[NORM]), + self.lookups.get_table("lexeme_norm"), + ) def to_disk(self, path, *, exclude=tuple()): """Save the current state to a directory. 
From 2e2334632beb0e91abc1d7820a0471a10af61489 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Tue, 16 Apr 2024 12:00:22 +0200 Subject: [PATCH 02/29] Fix use_gold_ents behaviour for EntityLinker (#13400) * fix type annotation in docs * only restore entities after loss calculation * restore entities of sample in initialization * rename overfitting function * fix EL scorer * Relax test * fix formatting * Update spacy/pipeline/entity_linker.py Co-authored-by: Raphael Mitsch * rename to _ensure_ents * further rename * allow for scorer to be None --------- Co-authored-by: Raphael Mitsch --- spacy/pipeline/entity_linker.py | 63 +++++++----- spacy/tests/pipeline/test_entity_linker.py | 107 ++++++++++++++++++++- website/docs/api/entitylinker.mdx | 2 +- 3 files changed, 145 insertions(+), 27 deletions(-) diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index a730ece1bfa..40a9c8a79dc 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -11,7 +11,6 @@ from ..errors import Errors from ..kb import Candidate, KnowledgeBase from ..language import Language -from ..ml import empty_kb from ..scorer import Scorer from ..tokens import Doc, Span from ..training import Example, validate_examples, validate_get_examples @@ -105,7 +104,7 @@ def make_entity_linker( ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions. generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase. scorer (Optional[Callable]): The scoring method. - use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another + use_gold_ents (bool): Whether to copy entities from gold docs during training or not. If false, another component must provide entity annotations. candidates_batch_size (int): Size of batches for entity candidate generation. threshold (Optional[float]): Confidence threshold for entity predictions. 
If confidence is below the threshold, @@ -235,7 +234,6 @@ def __init__( self.cfg: Dict[str, Any] = {"overwrite": overwrite} self.distance = CosineDistance(normalize=False) self.kb = generate_empty_kb(self.vocab, entity_vector_length) - self.scorer = scorer self.use_gold_ents = use_gold_ents self.candidates_batch_size = candidates_batch_size self.threshold = threshold @@ -243,6 +241,37 @@ def __init__( if candidates_batch_size < 1: raise ValueError(Errors.E1044) + def _score_with_ents_set(examples: Iterable[Example], **kwargs): + # Because of how spaCy works, we can't just score immediately, because Language.evaluate + # calls pipe() on the predicted docs, which won't have entities if there is no NER in the pipeline. + if not scorer: + return scorer + if not self.use_gold_ents: + return scorer(examples, **kwargs) + else: + examples = self._ensure_ents(examples) + docs = self.pipe( + (eg.predicted for eg in examples), + ) + for eg, doc in zip(examples, docs): + eg.predicted = doc + return scorer(examples, **kwargs) + + self.scorer = _score_with_ents_set + + def _ensure_ents(self, examples: Iterable[Example]) -> Iterable[Example]: + """If use_gold_ents is true, set the gold entities to (a copy of) eg.predicted.""" + if not self.use_gold_ents: + return examples + + new_examples = [] + for eg in examples: + ents, _ = eg.get_aligned_ents_and_ner() + new_eg = eg.copy() + new_eg.predicted.ents = ents + new_examples.append(new_eg) + return new_examples + def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]): """Define the KB of this pipe by providing a function that will create it using this object's vocab.""" @@ -284,11 +313,9 @@ def initialize( nO = self.kb.entity_vector_length doc_sample = [] vector_sample = [] - for eg in islice(get_examples(), 10): + examples = self._ensure_ents(islice(get_examples(), 10)) + for eg in examples: doc = eg.x - if self.use_gold_ents: - ents, _ = eg.get_aligned_ents_and_ner() - doc.ents = ents doc_sample.append(doc) 
vector_sample.append(self.model.ops.alloc1f(nO)) assert len(doc_sample) > 0, Errors.E923.format(name=self.name) @@ -354,31 +381,17 @@ def update( losses.setdefault(self.name, 0.0) if not examples: return losses + examples = self._ensure_ents(examples) validate_examples(examples, "EntityLinker.update") - set_dropout_rate(self.model, drop) - docs = [eg.predicted for eg in examples] - # save to restore later - old_ents = [doc.ents for doc in docs] - - for doc, ex in zip(docs, examples): - if self.use_gold_ents: - ents, _ = ex.get_aligned_ents_and_ner() - doc.ents = ents - else: - # only keep matching ents - doc.ents = ex.get_matching_ents() - # make sure we have something to learn from, if not, short-circuit if not self.batch_has_learnable_example(examples): return losses + set_dropout_rate(self.model, drop) + docs = [eg.predicted for eg in examples] sentence_encodings, bp_context = self.model.begin_update(docs) - # now restore the ents - for doc, old in zip(docs, old_ents): - doc.ents = old - loss, d_scores = self.get_loss( sentence_encodings=sentence_encodings, examples=examples ) @@ -386,11 +399,13 @@ def update( if sgd is not None: self.finish_update(sgd) losses[self.name] += loss + return losses def get_loss(self, examples: Iterable[Example], sentence_encodings: Floats2d): validate_examples(examples, "EntityLinker.get_loss") entity_encodings = [] + # We assume that get_loss is called with gold ents set in the examples if need be eidx = 0 # indices in gold entities to keep keep_ents = [] # indices in sentence_encodings to keep diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 00771a0f0f8..5e50a4d2801 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -717,7 +717,7 @@ def test_preserving_links_ents_2(nlp): # fmt: on -def test_overfitting_IO(): +def test_overfitting_IO_gold_entities(): # Simple test to try and quickly overfit the NEL component - ensuring 
the ML models work correctly nlp = English() vector_length = 3 @@ -744,7 +744,9 @@ def create_kb(vocab): return mykb # Create the Entity Linker component and add it to the pipeline - entity_linker = nlp.add_pipe("entity_linker", last=True) + entity_linker = nlp.add_pipe( + "entity_linker", last=True, config={"use_gold_ents": True} + ) assert isinstance(entity_linker, EntityLinker) entity_linker.set_kb(create_kb) assert "Q2146908" in entity_linker.vocab.strings @@ -807,6 +809,107 @@ def create_kb(vocab): assert_equal(batch_deps_1, batch_deps_2) assert_equal(batch_deps_1, no_batch_deps) + eval = nlp.evaluate(train_examples) + assert "nel_macro_p" in eval + assert "nel_macro_r" in eval + assert "nel_macro_f" in eval + assert "nel_micro_p" in eval + assert "nel_micro_r" in eval + assert "nel_micro_f" in eval + assert "nel_f_per_type" in eval + assert "PERSON" in eval["nel_f_per_type"] + + assert eval["nel_macro_f"] > 0 + assert eval["nel_micro_f"] > 0 + + +def test_overfitting_IO_with_ner(): + # Simple test to try and overfit the NER and NEL component in combination - ensuring the ML models work correctly + nlp = English() + vector_length = 3 + assert "Q2146908" not in nlp.vocab.strings + + # Convert the texts to docs to make sure we have doc.ents set for the training examples + train_examples = [] + for text, annotation in TRAIN_DATA: + doc = nlp(text) + train_examples.append(Example.from_dict(doc, annotation)) + + def create_kb(vocab): + # create artificial KB - assign same prior weight to the two russ cochran's + # Q2146908 (Russ Cochran): American golfer + # Q7381115 (Russ Cochran): publisher + mykb = InMemoryLookupKB(vocab, entity_vector_length=vector_length) + mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3]) + mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7]) + mykb.add_alias( + alias="Russ Cochran", + entities=["Q2146908", "Q7381115"], + probabilities=[0.5, 0.5], + ) + return mykb + + # Create the NER and EL components 
and add them to the pipeline + ner = nlp.add_pipe("ner", first=True) + entity_linker = nlp.add_pipe( + "entity_linker", last=True, config={"use_gold_ents": False} + ) + entity_linker.set_kb(create_kb) + + train_examples = [] + for text, annotations in TRAIN_DATA: + train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) + for ent in annotations.get("entities"): + ner.add_label(ent[2]) + optimizer = nlp.initialize() + + # train the NER and NEL pipes + for i in range(50): + losses = {} + nlp.update(train_examples, sgd=optimizer, losses=losses) + assert losses["ner"] < 0.001 + assert losses["entity_linker"] < 0.001 + + # adding additional components that are required for the entity_linker + nlp.add_pipe("sentencizer", first=True) + + # test the trained model + test_text = "Russ Cochran captured his first major title with his son as caddie." + doc = nlp(test_text) + ents = doc.ents + assert len(ents) == 1 + assert ents[0].text == "Russ Cochran" + assert ents[0].label_ == "PERSON" + assert ents[0].kb_id_ != "NIL" + + # TODO: below assert is still flaky - EL doesn't properly overfit quite yet + # assert ents[0].kb_id_ == "Q2146908" + + # Also test the results are still the same after IO + with make_tempdir() as tmp_dir: + nlp.to_disk(tmp_dir) + nlp2 = util.load_model_from_path(tmp_dir) + assert nlp2.pipe_names == nlp.pipe_names + doc2 = nlp2(test_text) + ents2 = doc2.ents + assert len(ents2) == 1 + assert ents2[0].text == "Russ Cochran" + assert ents2[0].label_ == "PERSON" + assert ents2[0].kb_id_ != "NIL" + + eval = nlp.evaluate(train_examples) + assert "nel_macro_f" in eval + assert "nel_micro_f" in eval + assert "ents_f" in eval + assert "nel_f_per_type" in eval + assert "ents_per_type" in eval + assert "PERSON" in eval["nel_f_per_type"] + assert "PERSON" in eval["ents_per_type"] + + assert eval["nel_macro_f"] > 0 + assert eval["nel_micro_f"] > 0 + assert eval["ents_f"] > 0 + def test_kb_serialization(): # Test that the KB can be used in a pipeline 
with a different vocab diff --git a/website/docs/api/entitylinker.mdx b/website/docs/api/entitylinker.mdx index 21d2e9015ce..c7b11985aea 100644 --- a/website/docs/api/entitylinker.mdx +++ b/website/docs/api/entitylinker.mdx @@ -61,7 +61,7 @@ architectures and their arguments and hyperparameters. | `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ | | `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ | | `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ | -| `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~ | +| `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~bool~~ | | `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | | `get_candidates_batch` 3.5 | Function that generates plausible candidates for a given batch of `Span` objects. Defaults to [CandidateBatchGenerator](/api/architectures#CandidateBatchGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]]~~ | | `generate_empty_kb` 3.5.1 | Function that generates an empty `KnowledgeBase` object. Defaults to [`spacy.EmptyKB.v2`](/api/architectures#EmptyKB), which generates an empty [`InMemoryLookupKB`](/api/inmemorylookupkb). 
~~Callable[[Vocab, int], KnowledgeBase]~~ | From 6d6c10ab9c2ff1059fdb062c4421a2ddd6c40c04 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Mon, 29 Apr 2024 10:18:07 +0200 Subject: [PATCH 03/29] Fix CI (#13469) * Remove hardcoded architecture setting * update classifiers to include Python 3.12 --- .github/workflows/tests.yml | 2 -- .github/workflows/universe_validation.yml | 1 - setup.cfg | 1 + 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 840b8e5f968..2a236b6bd3e 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -31,7 +31,6 @@ jobs: uses: actions/setup-python@v4 with: python-version: "3.7" - architecture: x64 - name: black run: | @@ -81,7 +80,6 @@ jobs: uses: actions/setup-python@v4 with: python-version: ${{ matrix.python_version }} - architecture: x64 - name: Install dependencies run: | diff --git a/.github/workflows/universe_validation.yml b/.github/workflows/universe_validation.yml index a1e3253a9ba..4d492500c57 100644 --- a/.github/workflows/universe_validation.yml +++ b/.github/workflows/universe_validation.yml @@ -26,7 +26,6 @@ jobs: uses: actions/setup-python@v4 with: python-version: "3.7" - architecture: x64 - name: Validate website/meta/universe.json run: | diff --git a/setup.cfg b/setup.cfg index a6b14eb0676..899e808cb04 100644 --- a/setup.cfg +++ b/setup.cfg @@ -22,6 +22,7 @@ classifiers = Programming Language :: Python :: 3.9 Programming Language :: Python :: 3.10 Programming Language :: Python :: 3.11 + Programming Language :: Python :: 3.12 Topic :: Scientific/Engineering project_urls = Release notes = https://github.com/explosion/spaCy/releases From 74836524e3372a158ecc42ba49b10a0baad975d4 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Mon, 29 Apr 2024 10:36:31 +0200 Subject: [PATCH 04/29] Bump to v5 (#13470) --- .github/workflows/lock.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/.github/workflows/lock.yml b/.github/workflows/lock.yml index 6c3985a930a..2bbdd64c771 100644 --- a/.github/workflows/lock.yml +++ b/.github/workflows/lock.yml @@ -16,7 +16,7 @@ jobs: if: github.repository_owner == 'explosion' runs-on: ubuntu-latest steps: - - uses: dessant/lock-threads@v4 + - uses: dessant/lock-threads@v5 with: process-only: 'issues' issue-inactive-days: '30' From 045cd43c3f8a2c2529393b464085809e995b6e8f Mon Sep 17 00:00:00 2001 From: Alex Strick van Linschoten Date: Mon, 29 Apr 2024 11:10:17 +0200 Subject: [PATCH 05/29] Fix typos in docs (#13466) * fix typos * prettier formatting --------- Co-authored-by: svlandeg --- spacy/cli/find_threshold.py | 4 +- spacy/tests/test_language.py | 2 +- website/docs/api/attributes.mdx | 60 ++++++++++----------- website/docs/api/cli.mdx | 4 +- website/docs/api/entitylinker.mdx | 32 +++++------ website/docs/api/entityruler.mdx | 6 +-- website/docs/api/span.mdx | 2 +- website/docs/api/transformer.mdx | 2 +- website/docs/usage/layers-architectures.mdx | 2 +- website/docs/usage/linguistic-features.mdx | 2 +- website/docs/usage/projects.mdx | 4 +- website/docs/usage/saving-loading.mdx | 11 ++-- website/docs/usage/v2-2.mdx | 2 +- website/docs/usage/v3-2.mdx | 2 +- 14 files changed, 69 insertions(+), 66 deletions(-) diff --git a/spacy/cli/find_threshold.py b/spacy/cli/find_threshold.py index 48077fa511d..3e86495e7c1 100644 --- a/spacy/cli/find_threshold.py +++ b/spacy/cli/find_threshold.py @@ -39,7 +39,7 @@ def find_threshold_cli( # fmt: on ): """ - Runs prediction trials for a trained model with varying tresholds to maximize + Runs prediction trials for a trained model with varying thresholds to maximize the specified metric. The search space for the threshold is traversed linearly from 0 to 1 in `n_trials` steps. 
Results are displayed in a table on `stdout` (the corresponding API call to `spacy.cli.find_threshold.find_threshold()` @@ -81,7 +81,7 @@ def find_threshold( silent: bool = True, ) -> Tuple[float, float, Dict[float, float]]: """ - Runs prediction trials for models with varying tresholds to maximize the specified metric. + Runs prediction trials for models with varying thresholds to maximize the specified metric. model (Union[str, Path]): Pipeline to evaluate. Can be a package or a path to a data directory. data_path (Path): Path to file with DocBin with docs to use for threshold search. pipe_name (str): Name of pipe to examine thresholds for. diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py index d229739e1ee..ee707f7931f 100644 --- a/spacy/tests/test_language.py +++ b/spacy/tests/test_language.py @@ -329,7 +329,7 @@ def test_language_pipe_error_handler(n_process): nlp.set_error_handler(raise_error) with pytest.raises(ValueError): list(nlp.pipe(texts, n_process=n_process)) - # set explicitely to ignoring + # set explicitly to ignoring nlp.set_error_handler(ignore_error) docs = list(nlp.pipe(texts, n_process=n_process)) assert len(docs) == 0 diff --git a/website/docs/api/attributes.mdx b/website/docs/api/attributes.mdx index 3142b741d9a..9cb76ac5842 100644 --- a/website/docs/api/attributes.mdx +++ b/website/docs/api/attributes.mdx @@ -45,33 +45,33 @@ For attributes that represent string values, the internal integer ID is accessed as `Token.attr`, e.g. `token.dep`, while the string value can be retrieved by appending `_` as in `token.dep_`. -| Attribute | Description | -| ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `DEP` | The token's dependency label. ~~str~~ | -| `ENT_ID` | The token's entity ID (`ent_id`). ~~str~~ | -| `ENT_IOB` | The IOB part of the token's entity tag. 
Uses custom integer vaues rather than the string store: unset is `0`, `I` is `1`, `O` is `2`, and `B` is `3`. ~~str~~ | -| `ENT_KB_ID` | The token's entity knowledge base ID. ~~str~~ | -| `ENT_TYPE` | The token's entity label. ~~str~~ | -| `IS_ALPHA` | Token text consists of alphabetic characters. ~~bool~~ | -| `IS_ASCII` | Token text consists of ASCII characters. ~~bool~~ | -| `IS_DIGIT` | Token text consists of digits. ~~bool~~ | -| `IS_LOWER` | Token text is in lowercase. ~~bool~~ | -| `IS_PUNCT` | Token is punctuation. ~~bool~~ | -| `IS_SPACE` | Token is whitespace. ~~bool~~ | -| `IS_STOP` | Token is a stop word. ~~bool~~ | -| `IS_TITLE` | Token text is in titlecase. ~~bool~~ | -| `IS_UPPER` | Token text is in uppercase. ~~bool~~ | -| `LEMMA` | The token's lemma. ~~str~~ | -| `LENGTH` | The length of the token text. ~~int~~ | -| `LIKE_EMAIL` | Token text resembles an email address. ~~bool~~ | -| `LIKE_NUM` | Token text resembles a number. ~~bool~~ | -| `LIKE_URL` | Token text resembles a URL. ~~bool~~ | -| `LOWER` | The lowercase form of the token text. ~~str~~ | -| `MORPH` | The token's morphological analysis. ~~MorphAnalysis~~ | -| `NORM` | The normalized form of the token text. ~~str~~ | -| `ORTH` | The exact verbatim text of a token. ~~str~~ | -| `POS` | The token's universal part of speech (UPOS). ~~str~~ | -| `SENT_START` | Token is start of sentence. ~~bool~~ | -| `SHAPE` | The token's shape. ~~str~~ | -| `SPACY` | Token has a trailing space. ~~bool~~ | -| `TAG` | The token's fine-grained part of speech. ~~str~~ | +| Attribute | Description | +| ------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `DEP` | The token's dependency label. ~~str~~ | +| `ENT_ID` | The token's entity ID (`ent_id`). ~~str~~ | +| `ENT_IOB` | The IOB part of the token's entity tag. 
Uses custom integer values rather than the string store: unset is `0`, `I` is `1`, `O` is `2`, and `B` is `3`. ~~str~~ | +| `ENT_KB_ID` | The token's entity knowledge base ID. ~~str~~ | +| `ENT_TYPE` | The token's entity label. ~~str~~ | +| `IS_ALPHA` | Token text consists of alphabetic characters. ~~bool~~ | +| `IS_ASCII` | Token text consists of ASCII characters. ~~bool~~ | +| `IS_DIGIT` | Token text consists of digits. ~~bool~~ | +| `IS_LOWER` | Token text is in lowercase. ~~bool~~ | +| `IS_PUNCT` | Token is punctuation. ~~bool~~ | +| `IS_SPACE` | Token is whitespace. ~~bool~~ | +| `IS_STOP` | Token is a stop word. ~~bool~~ | +| `IS_TITLE` | Token text is in titlecase. ~~bool~~ | +| `IS_UPPER` | Token text is in uppercase. ~~bool~~ | +| `LEMMA` | The token's lemma. ~~str~~ | +| `LENGTH` | The length of the token text. ~~int~~ | +| `LIKE_EMAIL` | Token text resembles an email address. ~~bool~~ | +| `LIKE_NUM` | Token text resembles a number. ~~bool~~ | +| `LIKE_URL` | Token text resembles a URL. ~~bool~~ | +| `LOWER` | The lowercase form of the token text. ~~str~~ | +| `MORPH` | The token's morphological analysis. ~~MorphAnalysis~~ | +| `NORM` | The normalized form of the token text. ~~str~~ | +| `ORTH` | The exact verbatim text of a token. ~~str~~ | +| `POS` | The token's universal part of speech (UPOS). ~~str~~ | +| `SENT_START` | Token is start of sentence. ~~bool~~ | +| `SHAPE` | The token's shape. ~~str~~ | +| `SPACY` | Token has a trailing space. ~~bool~~ | +| `TAG` | The token's fine-grained part of speech. 
~~str~~ | diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx index 950d98c1f68..6c47c8f1e89 100644 --- a/website/docs/api/cli.mdx +++ b/website/docs/api/cli.mdx @@ -567,7 +567,7 @@ New: 'ORG' (23860), 'PERSON' (21395), 'GPE' (21193), 'DATE' (18080), 'CARDINAL' 'LOC' (2113), 'TIME' (1616), 'WORK_OF_ART' (1229), 'QUANTITY' (1150), 'FAC' (1134), 'EVENT' (974), 'PRODUCT' (935), 'LAW' (444), 'LANGUAGE' (338) ✔ Good amount of examples for all labels -✔ Examples without occurences available for all labels +✔ Examples without occurrences available for all labels ✔ No entities consisting of or starting/ending with whitespace =========================== Part-of-speech Tagging =========================== @@ -1320,7 +1320,7 @@ $ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key] ## find-threshold {id="find-threshold",version="3.5",tag="command"} -Runs prediction trials for a trained model with varying tresholds to maximize +Runs prediction trials for a trained model with varying thresholds to maximize the specified metric. The search space for the threshold is traversed linearly from 0 to 1 in `n_trials` steps. Results are displayed in a table on `stdout` (the corresponding API call to `spacy.cli.find_threshold.find_threshold()` diff --git a/website/docs/api/entitylinker.mdx b/website/docs/api/entitylinker.mdx index c7b11985aea..f4b83d88bbf 100644 --- a/website/docs/api/entitylinker.mdx +++ b/website/docs/api/entitylinker.mdx @@ -67,7 +67,7 @@ architectures and their arguments and hyperparameters. | `generate_empty_kb` 3.5.1 | Function that generates an empty `KnowledgeBase` object. Defaults to [`spacy.EmptyKB.v2`](/api/architectures#EmptyKB), which generates an empty [`InMemoryLookupKB`](/api/inmemorylookupkb). ~~Callable[[Vocab, int], KnowledgeBase]~~ | | `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ | | `scorer` 3.2 | The scoring method. 
Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ | -| `threshold` 3.4 | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ | +| `threshold` 3.4 | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ | ```python %%GITHUB_SPACY/spacy/pipeline/entity_linker.py @@ -100,21 +100,21 @@ custom knowledge base, you should either call [`set_kb`](/api/entitylinker#set_kb) or provide a `kb_loader` in the [`initialize`](/api/entitylinker#initialize) call. -| Name | Description | -| ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | The shared vocabulary. ~~Vocab~~ | -| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model~~ | -| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | -| _keyword-only_ | | -| `entity_vector_length` | Size of encoding vectors in the KB. ~~int~~ | -| `get_candidates` | Function that generates plausible candidates for a given `Span` object. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | -| `labels_discard` | NER labels that will automatically get a `"NIL"` prediction. ~~Iterable[str]~~ | -| `n_sents` | The number of neighbouring sentences to take into account. 
~~int~~ | -| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. ~~bool~~ | -| `incl_context` | Whether or not to include the local context in the model. ~~bool~~ | -| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ | -| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ | -| `threshold` 3.4 | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ | +| Name | Description | +| ---------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | The shared vocabulary. ~~Vocab~~ | +| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model~~ | +| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ | +| _keyword-only_ | | +| `entity_vector_length` | Size of encoding vectors in the KB. ~~int~~ | +| `get_candidates` | Function that generates plausible candidates for a given `Span` object. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | +| `labels_discard` | NER labels that will automatically get a `"NIL"` prediction. ~~Iterable[str]~~ | +| `n_sents` | The number of neighbouring sentences to take into account. ~~int~~ | +| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. ~~bool~~ | +| `incl_context` | Whether or not to include the local context in the model. 
~~bool~~ | +| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ | +| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ | +| `threshold` 3.4 | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ | ## EntityLinker.\_\_call\_\_ {id="call",tag="method"} diff --git a/website/docs/api/entityruler.mdx b/website/docs/api/entityruler.mdx index 27624398ec6..335e87676c7 100644 --- a/website/docs/api/entityruler.mdx +++ b/website/docs/api/entityruler.mdx @@ -58,7 +58,7 @@ how the component should be configured. You can override its settings via the | Setting | Description | | ---------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `phrase_matcher_attr` | Optional attribute name match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. Defaults to `None`. ~~Optional[Union[int, str]]~~ | -| `matcher_fuzzy_compare` 3.5 | The fuzzy comparison method, passed on to the internal `Matcher`. Defaults to `spacy.matcher.levenshtein.levenshtein_compare`. ~~Callable~~ | +| `matcher_fuzzy_compare` 3.5 | The fuzzy comparison method, passed on to the internal `Matcher`. Defaults to `spacy.matcher.levenshtein.levenshtein_compare`. ~~Callable~~ | | `validate` | Whether patterns should be validated (passed to the `Matcher` and `PhraseMatcher`). Defaults to `False`. ~~bool~~ | | `overwrite_ents` | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. 
Defaults to `False`. ~~bool~~ | | `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~ | @@ -92,7 +92,7 @@ be a token pattern (list) or a phrase pattern (string). For example: | `name` 3 | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. Used to disable the current entity ruler while creating phrase patterns with the nlp object. ~~str~~ | | _keyword-only_ | | | `phrase_matcher_attr` | Optional attribute name match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. Defaults to `None`. ~~Optional[Union[int, str]]~~ | -| `matcher_fuzzy_compare` 3.5 | The fuzzy comparison method, passed on to the internal `Matcher`. Defaults to `spacy.matcher.levenshtein.levenshtein_compare`. ~~Callable~~ | +| `matcher_fuzzy_compare` 3.5 | The fuzzy comparison method, passed on to the internal `Matcher`. Defaults to `spacy.matcher.levenshtein.levenshtein_compare`. ~~Callable~~ | | `validate` | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. Defaults to `False`. ~~bool~~ | | `overwrite_ents` | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. ~~bool~~ | | `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"\|\|"`. ~~str~~ | @@ -173,7 +173,7 @@ happens automatically after the component has been added to the pipeline using [`nlp.add_pipe`](/api/language#add_pipe). If the entity ruler was initialized with `overwrite_ents=True`, existing entities will be replaced if they overlap with the matches. When matches overlap in a Doc, the entity ruler prioritizes -longer patterns over shorter, and if equal the match occuring first in the Doc +longer patterns over shorter, and if equal the match occurring first in the Doc is chosen. 
> #### Example diff --git a/website/docs/api/span.mdx b/website/docs/api/span.mdx index 41422a5b4e1..225ff6e6acd 100644 --- a/website/docs/api/span.mdx +++ b/website/docs/api/span.mdx @@ -288,7 +288,7 @@ it – so no NP-level coordination, no prepositional phrases, and no relative clauses. If the `noun_chunk` [syntax iterator](/usage/linguistic-features#language-data) -has not been implemeted for the given language, a `NotImplementedError` is +has not been implemented for the given language, a `NotImplementedError` is raised. > #### Example diff --git a/website/docs/api/transformer.mdx b/website/docs/api/transformer.mdx index 8f024553dac..9dcafb55782 100644 --- a/website/docs/api/transformer.mdx +++ b/website/docs/api/transformer.mdx @@ -416,7 +416,7 @@ by this class. Instances of this class are typically assigned to the | `align` | Alignment from the `Doc`'s tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. ~~Ragged~~ | | `width` | The width of the last hidden layer. ~~int~~ | -### TransformerData.empty {id="transformerdata-emoty",tag="classmethod"} +### TransformerData.empty {id="transformerdata-empty",tag="classmethod"} Create an empty `TransformerData` container. diff --git a/website/docs/usage/layers-architectures.mdx b/website/docs/usage/layers-architectures.mdx index 03b85f5af91..344c66e8db2 100644 --- a/website/docs/usage/layers-architectures.mdx +++ b/website/docs/usage/layers-architectures.mdx @@ -832,7 +832,7 @@ retrieve and add to them. After creation, the component needs to be [initialized](/usage/training#initialization). 
This method can define the -relevant labels in two ways: explicitely by setting the `labels` argument in the +relevant labels in two ways: explicitly by setting the `labels` argument in the [`initialize` block](/api/data-formats#config-initialize) of the config, or implicately by deducing them from the `get_examples` callback that generates the full **training data set**, or a representative sample. diff --git a/website/docs/usage/linguistic-features.mdx b/website/docs/usage/linguistic-features.mdx index 21cedd1ef2c..6ca97040780 100644 --- a/website/docs/usage/linguistic-features.mdx +++ b/website/docs/usage/linguistic-features.mdx @@ -1899,7 +1899,7 @@ the two words. "Shore": ("coast", 0.732257), "Precautionary": ("caution", 0.490973), "hopelessness": ("sadness", 0.742366), - "Continous": ("continuous", 0.732549), + "Continuous": ("continuous", 0.732549), "Disemboweled": ("corpse", 0.499432), "biostatistician": ("scientist", 0.339724), "somewheres": ("somewheres", 0.402736), diff --git a/website/docs/usage/projects.mdx b/website/docs/usage/projects.mdx index b089a7ab561..e10ba4c506c 100644 --- a/website/docs/usage/projects.mdx +++ b/website/docs/usage/projects.mdx @@ -173,7 +173,7 @@ detected, a corresponding warning is displayed. If you'd like to disable the dependency check, set `check_requirements: false` in your project's `project.yml`. -### 4. Run a workflow {id="run-workfow"} +### 4. Run a workflow {id="run-workflow"} > #### project.yml > @@ -286,7 +286,7 @@ pipelines. 
| --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | `title` | An optional project title used in `--help` message and [auto-generated docs](#custom-docs). | | `description` | An optional project description used in [auto-generated docs](#custom-docs). | -| `vars` | A dictionary of variables that can be referenced in paths, URLs and scripts and overriden on the CLI, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. | +| `vars` | A dictionary of variables that can be referenced in paths, URLs and scripts and overridden on the CLI, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. | | `env` | A dictionary of variables, mapped to the names of environment variables that will be read in when running the project. For example, `${env.name}` will use the value of the environment variable defined as `name`. | | `directories` | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist. 
| | `assets` | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. Instead of `url`, you can also provide a `git` block with the keys `repo`, `branch` and `path`, to download from a Git repo. | diff --git a/website/docs/usage/saving-loading.mdx b/website/docs/usage/saving-loading.mdx index 9a6791d5e0a..0b0b759e914 100644 --- a/website/docs/usage/saving-loading.mdx +++ b/website/docs/usage/saving-loading.mdx @@ -306,7 +306,9 @@ installed in the same environment – that's it. ### Loading probability tables into existing models -You can load a probability table from [spacy-lookups-data](https://github.com/explosion/spacy-lookups-data) into an existing spaCy model like `en_core_web_sm`. +You can load a probability table from +[spacy-lookups-data](https://github.com/explosion/spacy-lookups-data) into an +existing spaCy model like `en_core_web_sm`. ```python # Requirements: pip install spacy-lookups-data @@ -317,7 +319,8 @@ lookups = load_lookups("en", ["lexeme_prob"]) nlp.vocab.lookups.add_table("lexeme_prob", lookups.get_table("lexeme_prob")) ``` -When training a model from scratch you can also specify probability tables in the `config.cfg`. +When training a model from scratch you can also specify probability tables in +the `config.cfg`. ```ini {title="config.cfg (excerpt)"} [initialize.lookups] @@ -346,8 +349,8 @@ them**! 
To stick with the theme of [this entry points blog post](https://amir.rachum.com/blog/2017/07/28/python-entry-points/), consider the following custom spaCy -[pipeline component](/usage/processing-pipelines#custom-coponents) that prints a -snake when it's called: +[pipeline component](/usage/processing-pipelines#custom-components) that prints +a snake when it's called: > #### Package directory structure > diff --git a/website/docs/usage/v2-2.mdx b/website/docs/usage/v2-2.mdx index 84129657dda..cf4f7c5bf57 100644 --- a/website/docs/usage/v2-2.mdx +++ b/website/docs/usage/v2-2.mdx @@ -185,7 +185,7 @@ New: 'ORG' (23860), 'PERSON' (21395), 'GPE' (21193), 'DATE' (18080), 'CARDINAL' 'LOC' (2113), 'TIME' (1616), 'WORK_OF_ART' (1229), 'QUANTITY' (1150), 'FAC' (1134), 'EVENT' (974), 'PRODUCT' (935), 'LAW' (444), 'LANGUAGE' (338) ✔ Good amount of examples for all labels -✔ Examples without occurences available for all labels +✔ Examples without occurrences available for all labels ✔ No entities consisting of or starting/ending with whitespace =========================== Part-of-speech Tagging =========================== diff --git a/website/docs/usage/v3-2.mdx b/website/docs/usage/v3-2.mdx index b4a4ef67242..b3ffd5d6820 100644 --- a/website/docs/usage/v3-2.mdx +++ b/website/docs/usage/v3-2.mdx @@ -138,7 +138,7 @@ backwards compatibility, the tuple format remains available under `TransformerData.tensors` and `FullTransformerBatch.tensors`. See more details in the [transformer API docs](/api/architectures#TransformerModel). -`spacy-transfomers` v1.1 also adds support for `transformer_config` settings +`spacy-transformers` v1.1 also adds support for `transformer_config` settings such as `output_attentions`. Additional output is stored under `TransformerData.model_output`. More details are in the [TransformerModel docs](/api/architectures#TransformerModel). 
The training speed From ecd85d2618dcedb524bc457854ab3cd8e5979f20 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Mon, 29 Apr 2024 13:28:46 +0200 Subject: [PATCH 06/29] Update Typer pin and GH actions (#13471) * update gh actions * pin typer upperbound to 1.0.0 --- .github/workflows/explosionbot.yml | 2 +- .github/workflows/slowtests.yml | 2 +- .github/workflows/spacy_universe_alert.yml | 2 +- .github/workflows/tests.yml | 4 ++-- .github/workflows/universe_validation.yml | 2 +- requirements.txt | 2 +- setup.cfg | 2 +- 7 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/explosionbot.yml b/.github/workflows/explosionbot.yml index 910cfdc40ff..78a27cfa3ba 100644 --- a/.github/workflows/explosionbot.yml +++ b/.github/workflows/explosionbot.yml @@ -15,7 +15,7 @@ jobs: env: GITHUB_CONTEXT: ${{ toJson(github) }} run: echo "$GITHUB_CONTEXT" - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: actions/setup-python@v4 - name: Install and run explosion-bot run: | diff --git a/.github/workflows/slowtests.yml b/.github/workflows/slowtests.yml index f9fd3e81769..17d8989faa8 100644 --- a/.github/workflows/slowtests.yml +++ b/.github/workflows/slowtests.yml @@ -14,7 +14,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: ref: ${{ matrix.branch }} - name: Get commits from past 24 hours diff --git a/.github/workflows/spacy_universe_alert.yml b/.github/workflows/spacy_universe_alert.yml index 33851fbcc18..01731ffe0d7 100644 --- a/.github/workflows/spacy_universe_alert.yml +++ b/.github/workflows/spacy_universe_alert.yml @@ -18,7 +18,7 @@ jobs: run: | echo "$GITHUB_CONTEXT" - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: actions/setup-python@v4 with: python-version: '3.10' diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 2a236b6bd3e..af115e817e9 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml 
@@ -25,7 +25,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Check out repo - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Configure Python version uses: actions/setup-python@v4 @@ -74,7 +74,7 @@ jobs: steps: - name: Check out repo - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Configure Python version uses: actions/setup-python@v4 diff --git a/.github/workflows/universe_validation.yml b/.github/workflows/universe_validation.yml index 4d492500c57..ce7df49dbae 100644 --- a/.github/workflows/universe_validation.yml +++ b/.github/workflows/universe_validation.yml @@ -20,7 +20,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Check out repo - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Configure Python version uses: actions/setup-python@v4 diff --git a/requirements.txt b/requirements.txt index 54b8f22a17e..2ad92176d1d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,7 +9,7 @@ murmurhash>=0.28.0,<1.1.0 wasabi>=0.9.1,<1.2.0 srsly>=2.4.3,<3.0.0 catalogue>=2.0.6,<2.1.0 -typer>=0.3.0,<0.10.0 +typer>=0.3.0,<1.0.0 weasel>=0.1.0,<0.5.0 # Third party dependencies numpy>=1.15.0; python_version < "3.9" diff --git a/setup.cfg b/setup.cfg index 899e808cb04..ca8f645488d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -56,7 +56,7 @@ install_requires = catalogue>=2.0.6,<2.1.0 weasel>=0.1.0,<0.5.0 # Third-party dependencies - typer>=0.3.0,<0.10.0 + typer>=0.3.0,<1.0.0 tqdm>=4.38.0,<5.0.0 numpy>=1.15.0; python_version < "3.9" numpy>=1.19.0; python_version >= "3.9" From d3a232f773046771adc4cdfaf40343aff5872f4c Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Tue, 30 Apr 2024 09:17:59 +0200 Subject: [PATCH 07/29] Update LICENSE to include 2024 (#13472) --- LICENSE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSE b/LICENSE index 979f5ade7b4..6cb7810c6ee 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ The MIT License (MIT) -Copyright (C) 2016-2023 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew 
Honnibal +Copyright (C) 2016-2024 ExplosionAI GmbH, 2016 spaCy GmbH, 2015 Matthew Honnibal Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal From c195ca4f9ce98812eb7febc3043e212492ffc07a Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Thu, 2 May 2024 16:46:41 +0200 Subject: [PATCH 08/29] fix docs for MorphAnalysis.__contains__ (#13433) --- website/docs/api/morphology.mdx | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/website/docs/api/morphology.mdx b/website/docs/api/morphology.mdx index 018ce25245e..7f6802034d2 100644 --- a/website/docs/api/morphology.mdx +++ b/website/docs/api/morphology.mdx @@ -147,9 +147,10 @@ Whether a feature/value pair is in the analysis. > assert "Feat1=Val1" in morph > ``` -| Name | Description | -| ----------- | --------------------------------------------- | -| **RETURNS** | A feature/value pair in the analysis. ~~str~~ | +| Name | Description | +| ------------ | --------------------------------------------------------------------- | +| `feature` | A feature/value pair. ~~str~~ | +| **RETURNS** | Whether the feature/value pair is contained in the analysis. 
~~bool~~ | ### MorphAnalysis.\_\_iter\_\_ {id="morphanalysis-iter",tag="method"} From 82fc2ecfa521f6c2aa8e77d2750b81a0c90e4580 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Wed, 15 May 2024 12:11:33 +0200 Subject: [PATCH 09/29] Bump version to 3.7.5 (#13493) --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index f5ee66dae6f..b7fd3751ac5 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,5 +1,5 @@ # fmt: off __title__ = "spacy" -__version__ = "3.7.4" +__version__ = "3.7.5" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From a6d0fc3602b611e4d4a6fcc4f41cbea114ad13f8 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Fri, 31 May 2024 19:20:46 +0200 Subject: [PATCH 10/29] Remove typing-extensions from requirements (#13516) --- requirements.txt | 1 - setup.cfg | 1 - 2 files changed, 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 2ad92176d1d..7e7144d536f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -22,7 +22,6 @@ langcodes>=3.2.0,<4.0.0 # Official Python utilities setuptools packaging>=20.0 -typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8" # Development dependencies pre-commit>=2.13.0 cython>=0.25,<3.0 diff --git a/setup.cfg b/setup.cfg index ca8f645488d..2917f67eddc 100644 --- a/setup.cfg +++ b/setup.cfg @@ -66,7 +66,6 @@ install_requires = # Official Python utilities setuptools packaging>=20.0 - typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8" langcodes>=3.2.0,<4.0.0 [options.entry_points] From f78e5ce73290f39e7de6c2f5ec5c360fdd12fcee Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 21 Jun 2024 14:32:00 +0200 Subject: [PATCH 11/29] Disable extra CI --- .github/workflows/{gputests.yml => gputests.yml.disabled} | 0 .github/workflows/{slowtests.yml => slowtests.yml.disabled} | 0 2 files 
changed, 0 insertions(+), 0 deletions(-) rename .github/workflows/{gputests.yml => gputests.yml.disabled} (100%) rename .github/workflows/{slowtests.yml => slowtests.yml.disabled} (100%) diff --git a/.github/workflows/gputests.yml b/.github/workflows/gputests.yml.disabled similarity index 100% rename from .github/workflows/gputests.yml rename to .github/workflows/gputests.yml.disabled diff --git a/.github/workflows/slowtests.yml b/.github/workflows/slowtests.yml.disabled similarity index 100% rename from .github/workflows/slowtests.yml rename to .github/workflows/slowtests.yml.disabled From 8cda27aefaea64e899061564cdedd85f0fa269e5 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 26 Jun 2024 09:41:23 +0200 Subject: [PATCH 12/29] Add case study [ci skip] --- website/src/templates/index.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/website/src/templates/index.js b/website/src/templates/index.js index fad12f4c847..754cf47bf58 100644 --- a/website/src/templates/index.js +++ b/website/src/templates/index.js @@ -58,8 +58,8 @@ const AlertSpace = ({ nightly, legacy }) => { } const navAlert = ( - - 💥 Interested in Premium spaCy Models? + + 💥 New: Case study with S&P Global ) From a8accc33964443422dd5096557d2c98d38ce5b57 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 20 Aug 2024 12:15:05 +0200 Subject: [PATCH 13/29] Use cibuildwheel to build wheels (#13603) * Add workflow files for cibuildwheel * Add config for cibuildwheel * Set version for experimental prerelease * Try updating cython * Skip 32-bit windows builds * Revert "Try updating cython" This reverts commit c1b794ab5c48daa7ea4eaef816bf6b4a13e7b4f9. 
* Try to import cibuildwheel settings from previous setup --- .github/workflows/cibuildwheel.yml | 92 ++++++++++++++++++++++++++++++ .github/workflows/publish_pypi.yml | 29 ++++++++++ pyproject.toml | 53 +++++++++++++++++ spacy/about.py | 2 +- 4 files changed, 175 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/cibuildwheel.yml create mode 100644 .github/workflows/publish_pypi.yml diff --git a/.github/workflows/cibuildwheel.yml b/.github/workflows/cibuildwheel.yml new file mode 100644 index 00000000000..c5676ce49f1 --- /dev/null +++ b/.github/workflows/cibuildwheel.yml @@ -0,0 +1,92 @@ +name: Build + +on: + push: + tags: + # ytf did they invent their own syntax that's almost regex? + # ** matches 'zero or more of any character' + - 'release-v[0-9]+.[0-9]+.[0-9]+**' + - 'prerelease-v[0-9]+.[0-9]+.[0-9]+**' +jobs: + build_wheels: + name: Build wheels on ${{ matrix.os }} + runs-on: ${{ matrix.os }} + strategy: + matrix: + # macos-13 is an intel runner, macos-14 is apple silicon + os: [ubuntu-latest, windows-latest, macos-13] + + steps: + - uses: actions/checkout@v4 + - name: Build wheels + uses: pypa/cibuildwheel@v2.19.1 + env: + CIBW_SOME_OPTION: value + with: + package-dir: . 
+ output-dir: wheelhouse + config-file: "{package}/pyproject.toml" + - uses: actions/upload-artifact@v4 + with: + name: cibw-wheels-${{ matrix.os }}-${{ strategy.job-index }} + path: ./wheelhouse/*.whl + + build_sdist: + name: Build source distribution + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Build sdist + run: pipx run build --sdist + - uses: actions/upload-artifact@v4 + with: + name: cibw-sdist + path: dist/*.tar.gz + create_release: + needs: [build_wheels, build_sdist] + runs-on: ubuntu-latest + permissions: + contents: write + checks: write + actions: read + issues: read + packages: write + pull-requests: read + repository-projects: read + statuses: read + steps: + - name: Get the tag name and determine if it's a prerelease + id: get_tag_info + run: | + FULL_TAG=${GITHUB_REF#refs/tags/} + if [[ $FULL_TAG == release-* ]]; then + TAG_NAME=${FULL_TAG#release-} + IS_PRERELEASE=false + elif [[ $FULL_TAG == prerelease-* ]]; then + TAG_NAME=${FULL_TAG#prerelease-} + IS_PRERELEASE=true + else + echo "Tag does not match expected patterns" >&2 + exit 1 + fi + echo "FULL_TAG=$TAG_NAME" >> $GITHUB_ENV + echo "TAG_NAME=$TAG_NAME" >> $GITHUB_ENV + echo "IS_PRERELEASE=$IS_PRERELEASE" >> $GITHUB_ENV + - uses: actions/download-artifact@v4 + with: + # unpacks all CIBW artifacts into dist/ + pattern: cibw-* + path: dist + merge-multiple: true + - name: Create Draft Release + id: create_release + uses: softprops/action-gh-release@v2 + if: startsWith(github.ref, 'refs/tags/') + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + name: ${{ env.TAG_NAME }} + draft: true + prerelease: ${{ env.IS_PRERELEASE }} + files: "./dist/*" diff --git a/.github/workflows/publish_pypi.yml b/.github/workflows/publish_pypi.yml new file mode 100644 index 00000000000..9f432874cc2 --- /dev/null +++ b/.github/workflows/publish_pypi.yml @@ -0,0 +1,29 @@ +# The cibuildwheel action triggers on creation of a release, this +# triggers on publication. 
+# The expected workflow is to create a draft release and let the wheels +# upload, and then hit 'publish', which uploads to PyPi. + +on: + release: + types: + - published + +jobs: + upload_pypi: + runs-on: ubuntu-latest + environment: + name: pypi + url: https://pypi.org/p/spacy + permissions: + id-token: write + contents: read + if: github.event_name == 'release' && github.event.action == 'published' + # or, alternatively, upload to PyPI on every tag starting with 'v' (remove on: release above to use this) + # if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') + steps: + - uses: robinraju/release-downloader@v1 + with: + tag: ${{ github.event.release.tag_name }} + fileName: '*' + out-file-path: 'dist' + - uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/pyproject.toml b/pyproject.toml index bfd7e68d1f7..07ffe167792 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,5 +11,58 @@ requires = [ ] build-backend = "setuptools.build_meta" +[tool.cibuildwheel] +build = "*" +skip = "pp* cp36* cp37* cp38* *-win32" +test-skip = "" +free-threaded-support = false + +archs = ["native"] + +build-frontend = "default" +config-settings = {} +dependency-versions = "pinned" +environment = { PIP_CONSTRAINT = "build-constraints.txt" } + +environment-pass = [] +build-verbosity = 0 + +before-all = "curl https://sh.rustup.rs -sSf | sh -s -- -y --profile minimal --default-toolchain stable" +before-build = "pip install -r requirements.txt && python setup.py clean" +repair-wheel-command = "" + +test-command = "" +before-test = "" +test-requires = [] +test-extras = [] + +container-engine = "docker" + +manylinux-x86_64-image = "manylinux2014" +manylinux-i686-image = "manylinux2014" +manylinux-aarch64-image = "manylinux2014" +manylinux-ppc64le-image = "manylinux2014" +manylinux-s390x-image = "manylinux2014" +manylinux-pypy_x86_64-image = "manylinux2014" +manylinux-pypy_i686-image = "manylinux2014" +manylinux-pypy_aarch64-image = "manylinux2014" + 
+musllinux-x86_64-image = "musllinux_1_2" +musllinux-i686-image = "musllinux_1_2" +musllinux-aarch64-image = "musllinux_1_2" +musllinux-ppc64le-image = "musllinux_1_2" +musllinux-s390x-image = "musllinux_1_2" + +[tool.cibuildwheel.linux] +repair-wheel-command = "auditwheel repair -w {dest_dir} {wheel}" + +[tool.cibuildwheel.macos] +repair-wheel-command = "delocate-wheel --require-archs {delocate_archs} -w {dest_dir} -v {wheel}" + +[tool.cibuildwheel.windows] + +[tool.cibuildwheel.pyodide] + + [tool.isort] profile = "black" diff --git a/spacy/about.py b/spacy/about.py index b7fd3751ac5..4c912714610 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,5 +1,5 @@ # fmt: off __title__ = "spacy" -__version__ = "3.7.5" +__version__ = "3.7.6a" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From 319e02545cff2d1c4d4b3fd5e7fb91cabcceb706 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 20 Aug 2024 12:16:08 +0200 Subject: [PATCH 14/29] Set version to 3.7.6 --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 4c912714610..23ef181ebca 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,5 +1,5 @@ # fmt: off __title__ = "spacy" -__version__ = "3.7.6a" +__version__ = "3.7.6" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From 55db9c2e87a03fe1389da2f201f3d7412aa92a11 Mon Sep 17 00:00:00 2001 From: Mark Liberko <163771866+markliberko@users.noreply.github.com> Date: Mon, 9 Sep 2024 04:14:09 -0500 Subject: [PATCH 15/29] Added gd language folder (#13570) Implemented a foundational Scottish Gaelic (gd) language option with tokenizer_exceptions and stop_words files. 
--- spacy/lang/gd/__init__.py | 18 + spacy/lang/gd/stop_words.py | 386 +++++ spacy/lang/gd/tokenizer_exceptions.py | 1982 +++++++++++++++++++++++++ 3 files changed, 2386 insertions(+) create mode 100644 spacy/lang/gd/__init__.py create mode 100644 spacy/lang/gd/stop_words.py create mode 100644 spacy/lang/gd/tokenizer_exceptions.py diff --git a/spacy/lang/gd/__init__.py b/spacy/lang/gd/__init__.py new file mode 100644 index 00000000000..048a3a07183 --- /dev/null +++ b/spacy/lang/gd/__init__.py @@ -0,0 +1,18 @@ +from typing import Optional + +from ...language import BaseDefaults, Language +from .stop_words import STOP_WORDS +from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS + + +class ScottishDefaults(BaseDefaults): + tokenizer_exceptions = TOKENIZER_EXCEPTIONS + stop_words = STOP_WORDS + + +class Scottish(Language): + lang = "gd" + Defaults = ScottishDefaults + + +__all__ = ["Scottish"] diff --git a/spacy/lang/gd/stop_words.py b/spacy/lang/gd/stop_words.py new file mode 100644 index 00000000000..d5132c35e31 --- /dev/null +++ b/spacy/lang/gd/stop_words.py @@ -0,0 +1,386 @@ +STOP_WORDS = set( + """ +'ad +'ar +'d # iad +'g # ag +'ga +'gam +'gan +'gar +'gur +'m # am +'n # an +'n seo +'na +'nad +'nam +'nan +'nar +'nuair +'nur +'s +'sa +'san +'sann +'se +'sna +a +a' +a'd # agad +a'm # agam +a-chèile +a-seo +a-sin +a-siud +a chionn +a chionn 's +a chèile +a chéile +a dh' +a h-uile +a seo +ac' # aca +aca +aca-san +acasan +ach +ag +agad +agad-sa +agads' +agadsa +agaibh +agaibhse +againn +againne +agam +agam-sa +agams' +agamsa +agus +aice +aice-se +aicese +aig +aig' # aige +aige +aige-san +aigesan +air +air-san +air neo +airsan +am +an +an seo +an sin +an siud +an uair +ann +ann a +ann a' +ann a shin +ann am +ann an +annad +annam +annam-s' +annamsa +anns +anns an +annta +aon +ar +as +asad +asda +asta +b' +bho +bhon +bhuaidhe # bhuaithe +bhuainn +bhuaipe +bhuaithe +bhuapa +bhur +brì +bu +c'à +car son +carson +cha +chan +chionn +choir +chon +chun +chèile +chéile +chòir 
+cia mheud +ciamar +co-dhiubh +cuide +cuin +cuin' +cuine +cà +cà' +càil +càit +càit' +càite +cò +cò mheud +có +d' +da +de +dh' +dha +dhaibh +dhaibh-san +dhaibhsan +dhan +dhasan +dhe +dhen +dheth +dhi +dhiom +dhiot +dhith +dhiubh +dhomh +dhomh-s' +dhomhsa +dhu'sa # dhut-sa +dhuibh +dhuibhse +dhuinn +dhuinne +dhuit +dhut +dhutsa +dhut-sa +dhà +dhà-san +dhàsan +dhòmhsa +diubh +do +docha +don +dà +dè +dè mar +dé +dé mar +dòch' +dòcha +e +eadar +eatarra +eatorra +eile +esan +fa +far +feud +fhad +fheudar +fhearr +fhein +fheudar +fheàrr +fhèin +fhéin +fhìn +fo +fodha +fodhainn +foipe +fon +fèin +ga +gach +gam +gan +ge brith +ged +gu +gu dè +gu ruige +gun +gur +gus +i +iad +iadsan +innte +is +ise +le +leam +leam-sa +leamsa +leat +leat-sa +leatha +leatsa +leibh +leis +leis-san +leoth' +leotha +leotha-san +linn +m' +m'a +ma +mac +man +mar +mas +mathaid +mi +mis' +mise +mo +mu +mu 'n +mun +mur +mura +mus +na +na b' +na bu +na iad +nach +nad +nam +nan +nar +nas +neo +no +nuair +o +o'n +oir +oirbh +oirbh-se +oirnn +oirnne +oirre +on +orm +orm-sa +ormsa +orra +orra-san +orrasan +ort +os +r' +ri +ribh +rinn +ris +rithe +rithe-se +rium +rium-sa +riums' +riumsa +riut +riuth' +riutha +riuthasan +ro +ro'n +roimh +roimhe +romhainn +romham +romhpa +ron +ruibh +ruinn +ruinne +sa +san +sann +se +seach +seo +seothach +shin +sibh +sibh-se +sibhse +sin +sineach +sinn +sinne +siod +siodach +siud +siudach +sna # ann an +sè +t' +tarsaing +tarsainn +tarsuinn +thar +thoigh +thro +thu +thuc' +thuca +thugad +thugaibh +thugainn +thugam +thugamsa +thuice +thuige +thus' +thusa +timcheall +toigh +toil +tro +tro' # troimh +troimh +troimhe +tron +tu +tusa +uair +ud +ugaibh +ugam-s' +ugam-sa +uice +uige +uige-san +umad +unnta # ann an +ur +urrainn +à +às +àsan +á +ás +è +ì +ò +ó +""".split("\n") +) diff --git a/spacy/lang/gd/tokenizer_exceptions.py b/spacy/lang/gd/tokenizer_exceptions.py new file mode 100644 index 00000000000..bf47bd85950 --- /dev/null +++ b/spacy/lang/gd/tokenizer_exceptions.py @@ -0,0 
+1,1982 @@ +from ...symbols import NORM, ORTH +from ...util import update_exc +from ..tokenizer_exceptions import BASE_EXCEPTIONS + +""" + All rules and exceptions were taken from the "Gaelic Orthographic Conventions +of 2009" (GOC) and from the "Annotated Reference Corpus of Scottish Gaelic" (ARCOSG). I did +my best to ensure this tokenizer would lead to text as close as possible to the +tokenization of the ARCOSG and the conventions in the GOC. + + +ARCOSG: https://github.com/Gaelic-Algorithmic-Research-Group/ARCOSG +GOC: https://www.gaidhlig.scot/wp-content/uploads/2021/03/GOC-2009-English.pdf +""" + +# Compound words +_exc = { + "càil": [{ORTH: "cà", NORM: "càite"}, {ORTH: "il", NORM: "bheil"}], + "sna": [{ORTH: "s", NORM: "anns"}, {ORTH: "na", NORM: "na"}], + "orra": [{ORTH: "orr", NORM: "air"}, {ORTH: "a", NORM: "do"}], + "fiùs": [{ORTH: "fiù", NORM: "fiù"}, {ORTH: "s", NORM: "'s"}] +} + + +# Hyphenations that are alternative forms of words +for exc_data in [ + {ORTH: "fa-near",NORM: "fainear"}, + {ORTH: "Fa-near",NORM: "Fainear"}, +]: + _exc[exc_data[ORTH]] = [exc_data] + + + +# Abbreviations and shortened words +for exc_data in [ + {ORTH: "'", NORM: "a"}, + {ORTH: "'S", NORM: "Agus"}, + {ORTH: "'s", NORM: "agus"}, + {ORTH: "B'", NORM: "Bu"}, + {ORTH: "b'", NORM: "bu"}, + {ORTH: "D'", NORM: "Do"}, + {ORTH: "d'", NORM: "do"}, + {ORTH: "'M", NORM: "Am"}, + {ORTH: "'m", NORM: "am"}, + {ORTH: "M'", NORM: "Mo"}, + {ORTH: "m'", NORM: "mo"}, + {ORTH: "'n", NORM: "an"}, + {ORTH: "'N", NORM: "An"}, + {ORTH: "Th'", NORM: "Tha"}, + {ORTH: "th'", NORM: "tha"}, +]: + _exc[exc_data[ORTH]] = [exc_data] + + +# Words with a leading apostrophe +for orth in """ + 'ac + 'Ac + 'ad + 'Ad + 'ar + 'Ar + 'bhuannachd + 'Bhuannachd + 'd + 'D + 'eil + 'Eil + 'eug + 'Eug + 'g + 'G + 'ga + 'Ga + 'gad + 'Gad + 'gam + 'Gam + 'gan + 'Gan + 'gar + 'Gar + 'gur + 'Gur + 'ic + 'Ic + 'il + 'Il + 'ill' + 'Ill' + 'ille + 'Ille + 'illean + 'Illean + 'iodh + 'Iodh + 'l + 'L + 'm + 'M + 'n + 'N + 'na
+ 'Na + 'nad + 'Nad + 'nam + 'Nam + 'nan + 'Nan + 'nar + 'Nar + 'neil + 'Neil + 'nise + 'Nise + 'nuair + 'Nuair + 'nur + 'Nur + 's + 'S + 'sa + 'Sa + 'sa' + 'Sa' + 'san + 'San + 'sann + 'Sann + 'se + 'Se + 'sna + 'Sna + 'son + 'Son + 'urchaidh + 'Urchaidh + """.split(): + _exc[orth] = [{ORTH: orth}] + +# Words with a trailing or middling apostrophe +for orth in """ + a' + A' + a'd + A'd + a'm + A'm + a's + A's + ac' + Ac' + agads' + Agads' + agams' + Agams' + aig' + Aig' + annams' + Annams' + ars' + Ars' + b' + B' + ball' + Ball' + bioraicht' + Bioraicht' + bh' + Bh' + bhail' + Bhail' + bhall' + Bhall' + bheath' + Bheath' + bhliadhn' + Bhliadhn' + bliadhn' + Bliadhn' + bonnant' + Bonnant' + brist' + Brist' + bàt' + Bàt' + c'à + C'à + camp' + Camp' + chalp' + Chalp' + champ' + Champ' + chomhairl' + Chomhairl' + chual' + Chual' + chuimhn' + Chuimhn' + colaisd' + Colaisd' + comhl' + Comhl' + comhairl' + Comhairl' + creids' + Creids' + cual' + Cual' + cuimhn' + Cuimhn' + cuin' + Cuin' + cà' + Cà' + càit' + Càit' + d' + D' + d'readh + D'readh + d'reaghadh + D'reaghadh + daoin' + Daoin' + deimhinn' + Deimhinn' + de'n + De'n + dh' + Dh' + dhaib' + Dhaib' + dhaoin' + Dhaoin' + dhòmhs' + Dhòmhs' + dhu'sa + Dhu'sa + dhuin' + Dhuin' + do'n + Do'n + duin' + Duin' + dòch' + Dòch' + dùint' + Dùint' + eil' + Eil' + f'a + F'a + fac' + Fac' + fad' + Fad' + fhac' + Fhac' + fhad' + Fhad' + fhaid' + Fhaid' + fhaisg' + Fhaisg' + fhiosd' + Fhiosd' + fàilt' + Fàilt' + g' + G' + gàir' + Gàir' + ghill' + Ghill' + gill' + Gill' + inns' + Inns' + innt' + Innt' + ionnsaicht' + Ionnsaicht' + leams' + Leams' + leoth' + Leoth' + lobht' + Lobht' + m' + M' + m'a + M'a + m's + M's + mhuth' + Mhuth' + mhòr' + Mhòr' + mis' + Mis' + mu'n + Mu'n + mòr' + Mòr' + oirr' + Oirr' + o'n + O'n + phàp' + Phàp' + pàp' + Pàp' + pòs' + Pòs' + prionns' + Prionns' + r' + R' + riums' + Riums' + riuth' + Riuth' + ro'n + Ro'n + sa' + Sa' + sgoil' + Sgoil' + sgìr' + Sgìr' + sheòrs' + Sheòrs' + sin' + Sin' + stall' + 
Stall' + sìod' + Sìod' + sònraicht' + Sònraicht' + t' + T' + taigh' + Taigh' + tein' + Tein' + teoth' + Teoth' + th' + Th' + thoilicht' + Thoilicht' + thuc' + Thuc' + thuigs' + Thuigs' + thus' + Thus' + thàna' + Thàna' + toilicht' + Toilicht' + tro' + Tro' + uisg' + Uisg' + àit' + Àit' + òg' + Òg' + """.split(): + _exc[orth] = [{ORTH: orth}] + + +# Hyphenations that should remain as single tokens +for orth in """ +'n-dràsda +'N-dràsda +-bhliadhn' +-bhliadhn' +a-bhos +A-bhos +a-bhòn-dè +A-bhòn-dè +a-cheart +A-cheart +a-chèile +A-chèile +a-deas +A-deas +a-mach +A-mach +a-mhàin +A-mhàin +a-muigh +A-muigh +a-màireach +A-màireach +a-nall +A-nall +a-neist +A-neist +a-ni +A-ni +a-nis +A-nis +a-nisd +A-nisd +a-nise +A-nise +a-nist +A-nist +a-niste +A-niste +a-nochd +A-nochd +a-nuas +A-nuas +a-null +A-null +a-raoir +A-raoir +a-riamh +A-riamh +a-rithist +A-rithist +a-rèiste +A-rèiste +a-rìs +A-rìs +a-seo +A-seo +a-sin +A-sin +a-sineach +A-sineach +a-siud +A-siud +a-staigh +A-staigh +a-steach +A-steach +a-tuath +A-tuath +aca-san +Aca-san +agad-sa +Agad-sa +agam-sa +Agam-sa +aghaidh-bhualaich +Aghaidh-bhualaich +aice-se +Aice-se +aige-san +Aige-san +ainmeannan-àite +Ainmeannan-àite +air-san +Air-san +am-bliadhna +Am-bliadhna +am-màireach +Am-màireach +amp-head +Amp-head +an-diugh +An-diugh +an-dràsd +An-dràsd +an-dràsda +An-dràsda +an-dràst +An-dràst +an-dràsta +An-dràsta +an-dè +An-dè +an-dé +An-dé +an-nise +An-nise +an-nochd +An-nochd +an-raoir +An-raoir +an-uiridh +An-uiridh +an-àbhaisteach +An-àbhaisteach +an-àird +An-àird +an-àirde +An-àirde +an-àrda +An-àrda +ana-ceartas +Ana-ceartas +ana-seo +Ana-seo +ana-sin +Ana-sin +ana-siud +Ana-siud +annam-s' +Annam-s' +ao-coltach +Ao-coltach +aobhar-sa +Aobhar-sa +aois-léinn +Aois-léinn +aona-ghnothaich +Aona-ghnothaich +ar-a-mach +Ar-a-mach +ard-easbaig +Ard-easbaig +ard-luchd-poilitics +Ard-luchd-poilitics +ath-bhaile +Ath-bhaile +ath-bheòthachadh +Ath-bheòthachadh +ath-bhliadhna +Ath-bhliadhna +ath-ghiollachd +Ath-ghiollachd 
+ath-nuadhais +Ath-nuadhais +ath-sgrùdadh +Ath-sgrùdadh +ath-thriop +Ath-thriop +athair-san +Athair-san +baile-ciùird +Baile-ciùird +ball-coise +Ball-coise +ball-pàrlamaid +Ball-pàrlamaid +ball-sampaill +Ball-sampaill +balla-mara +Balla-mara +ban-chompanach +Ban-chompanach +ban-fhuamhaire +Ban-fhuamhaire +ban-ghillìosach +Ban-ghillìosach +ban-righ'nn +Ban-righ'nn +ban-rìgh +Ban-rìgh +bana-bhàird +Bana-bhàird +bana-chompanaich +Bana-chompanaich +bana-phòsda +Bana-phòsda +banas-taighe +Banas-taighe +beairt-fhuaigheil +Beairt-fhuaigheil +beairt-fuaigheil +Beairt-fuaigheil +bean-gairm +Bean-gairm +bean-phòsta +Bean-phòsta +bean-taighe +Bean-taighe +beul-aithris +Beul-aithris +beò-shlàint +Beò-shlàint +beò-shlàint' +Beò-shlàint' +beò-shlàinte +Beò-shlàinte +bhaga-sgoil +Bhaga-sgoil +bhall-pàrlamaid +Bhall-pàrlamaid +bhan-chompanach +Bhan-chompanach +bhan-dòmhnallach +Bhan-dòmhnallach +bhan-phrionnsa +Bhan-phrionnsa +bhan-righinn +Bhan-righinn +bhan-sheinneadair +Bhan-sheinneadair +bharr-iall +Bharr-iall +bhata-làidir +Bhata-làidir +bhath-room +Bhath-room +bheachd-sa +Bheachd-sa +bheachd-san +Bheachd-san +bheairt-fhighe +Bheairt-fhighe +bheairtean-fuaigheil +Bheairtean-fuaigheil +bheinn-sheilg +Bheinn-sheilg +bheul-aithris +Bheul-aithris +bheò-ghlacadh +Bheò-ghlacadh +bhith-beò +Bhith-beò +bhithinn-sa +Bhithinn-sa +bhogsa-chiùil +Bhogsa-chiùil +bhonn-stéidh +Bhonn-stéidh +bhràithrean-sa +Bhràithrean-sa +bhuain-mhòine +Bhuain-mhòine +bhun-sheòrsa +Bhun-sheòrsa +bhàn-righinn +Bhàn-righinn +bhàn-rinn +Bhàn-rinn +bhàn-rìgh +Bhàn-rìgh +bhàta-aiseig +Bhàta-aiseig +bhàta-sa +Bhàta-sa +bird-watcher +Bird-watcher +bith-beò +Bith-beò +bithinn-sa +Bithinn-sa +bliadhna-sa +Bliadhna-sa +bogha-saighead +Bogha-saighead +boma-peatroil +Boma-peatroil +bristeadh-a-mach +Bristeadh-a-mach +buidhean-cathrannais +Buidhean-cathrannais +buille-a-mach +Buille-a-mach +buille-shaor +Buille-shaor +bun-adhbharan +Bun-adhbharan +bun-chomharraidhean +Bun-chomharraidhean +bun-fhiosrachadh 
+Bun-fhiosrachadh +bun-sgoil +Bun-sgoil +bun-stèidh +Bun-stèidh +bàt-aiseig +Bàt-aiseig +bàta-aiseig +Bàta-aiseig +bàta-bathair +Bàta-bathair +cainnt-san +Cainnt-san +cal-mac +Cal-mac +carraighean-cuimhne +Carraighean-cuimhne +cead-telebhisean +Cead-telebhisean +ceann-cinnidh +Ceann-cinnidh +ceann-suidhe +Ceann-suidhe +chanain-sa +Chanain-sa +chaolas-arcach +Chaolas-arcach +charge-adh +Charge-adh +cheala-deug +Cheala-deug +chealla-deug +Chealla-deug +cheann-cinnidh +Cheann-cinnidh +cheann-feadhna +Cheann-feadhna +cheann-suidhe +Cheann-suidhe +chearc-fhraoich +Chearc-fhraoich +chearcall-meadhain +Chearcall-meadhain +chearcall-mheadhain +Chearcall-mheadhain +chlann-nighean +Chlann-nighean +chlàr-ama +Chlàr-ama +chlò-bhuaileadh +Chlò-bhuaileadh +chlò-bhualadh +Chlò-bhualadh +cho-chreutairean +Cho-chreutairean +cho-dhùin +Cho-dhùin +cho-dhùnadh +Cho-dhùnadh +cho-dhùnaidhean +Cho-dhùnaidhean +cho-fhaireachdainn +Cho-fhaireachdainn +cho-labhairt +Cho-labhairt +cho-obraiche +Cho-obraiche +cho-roinn +Cho-roinn +chom-pàirt +Chom-pàirt +chorra-ghritheach +Chorra-ghritheach +chrann-snàth +Chrann-snàth +chreach-s' +Chreach-s' +chrith-thalmhainn +Chrith-thalmhainn +chàch-a-chéile +Chàch-a-chéile +cinn-chuspair +Cinn-chuspair +cinn-iùil +Cinn-iùil +cion-doighe +Cion-doighe +clachan-meallain +Clachan-meallain +clann-sgoile +Clann-sgoile +claon-fhaireachdainn +Claon-fhaireachdainn +claon-shamhail +Claon-shamhail +cluicheadairean-meadhain +Cluicheadairean-meadhain +clàran-ama +Clàran-ama +cléir-seanchain +Cléir-seanchain +clò-bhualadair +Clò-bhualadair +clò-bhualadh +Clò-bhualadh +co-aimsireach +Co-aimsireach +co-bhanntachd +Co-bhanntachd +co-bhuannachd +Co-bhuannachd +co-buannachd +Co-buannachd +co-cheangail +Co-cheangail +co-cheangailte +Co-cheangailte +co-cheangal +Co-cheangal +co-chreutairean +Co-chreutairean +co-chruinneachadh +Co-chruinneachadh +co-dhiu +Co-dhiu +co-dhiubh +Co-dhiubh +co-dhiù +Co-dhiù +co-dhiùbh +Co-dhiùbh +co-dhùnadh +Co-dhùnadh +co-dhùnaidhean 
+Co-dhùnaidhean +co-fhaireachadh +Co-fhaireachadh +co-fhaireachdainn +Co-fhaireachdainn +co-impirean +Co-impirean +co-ionad +Co-ionad +co-ionann +Co-ionann +co-labhairt +Co-labhairt +co-labhairtean +Co-labhairtean +co-obrachadh +Co-obrachadh +co-sheirm +Co-sheirm +co-theacs +Co-theacs +coimeas-meudachd +Coimeas-meudachd +cola-deug +Cola-deug +com-pàirt +Com-pàirt +cope-adh +Cope-adh +crann-aodaich +Crann-aodaich +crann-snàth +Crann-snàth +crann-tarsainn +Crann-tarsainn +craobh-sgaoileadh +Craobh-sgaoileadh +crith-thalmhainn +Crith-thalmhainn +cruth-rannsachadh +Cruth-rannsachadh +cuid-eigin +Cuid-eigin +cumail-san +Cumail-san +cur-gu-buil +Cur-gu-buil +cur-seachad +Cur-seachad +cur-seachadan +Cur-seachadan +cìs-comhairle +Cìs-comhairle +cò-dhunadh +Cò-dhunadh +còmhlan-ciùil +Còmhlan-ciùil +cùis-lagh +Cùis-lagh +cùl-chàineadh +Cùl-chàineadh +cùl-shleamhnach +Cùl-shleamhnach +cùl-taic +Cùl-taic +da-rìreabh +Da-rìreabh +da-rìreadh +Da-rìreadh +da-rìribh +Da-rìribh +deagh-ghean +Deagh-ghean +dearg-fhuileach +Dearg-fhuileach +deireadh-sheachdain +Deireadh-sheachdain +deoch-làidir +Deoch-làidir +dha-rìreabh +Dha-rìreabh +dha-rìribh +Dha-rìribh +dhaibh-san +Dhaibh-san +dhe-salin-adh +Dhe-salin-adh +dhe-salt-adh +Dhe-salt-adh +dheidhinn-sa +Dheidhinn-sa +dhol-sìos +Dhol-sìos +dhomh-s' +Dhomh-s' +dhuine-dubh +Dhuine-dubh +dhà-san +Dhà-san +dhòigh-beatha +Dhòigh-beatha +di-sathairne +Di-sathairne +dian-amharc +Dian-amharc +dlùth-cheangal +Dlùth-cheangal +do-chreidsinneach +Do-chreidsinneach +do-labhairt +Do-labhairt +do-sheachant' +Do-sheachant' +dol-a-mach +Dol-a-mach +dol-air-adhart +Dol-air-adhart +dubh-chàineadh +Dubh-chàineadh +dubh-ghorm +Dubh-ghorm +dà-chultarach +Dà-chultarach +dà-reug +Dà-reug +dàn-mòr +Dàn-mòr +dì-moladh +Dì-moladh +dòigh-beatha +Dòigh-beatha +dòighean-beatha +Dòighean-beatha +e-mail +E-mail +eadar-dhealachadh +Eadar-dhealachadh +eadar-dhealachaidhean +Eadar-dhealachaidhean +eadar-dhealaichte +Eadar-dhealaichte +eadar-nàiseanta +Eadar-nàiseanta 
+earbainn-s +Earbainn-s +eàrr-ràdh +Eàrr-ràdh +eòrp-innseanach +Eòrp-innseanach +fa-leth +Fa-leth +fa-near +Fa-near +fad-as +Fad-as +fad-thréimhseach +Fad-thréimhseach +feadaig-mhonaidh +Feadaig-mhonaidh +fealla-dhà +Fealla-dhà +fear-a-ropa +Fear-a-ropa +fear-ceasnachaidh +Fear-ceasnachaidh +fear-faire +Fear-faire +fear-gairm +Fear-gairm +fear-glèidhidh +Fear-glèidhidh +fear-labhairt +Fear-labhairt +fear-naidheachd +Fear-naidheachd +fear-pòsta +Fear-pòsta +fear-sgrùdaidh +Fear-sgrùdaidh +fear-teagaisg +Fear-teagaisg +fear-trèinidh +Fear-trèinidh +fear-éisteachd +Fear-éisteachd +feed-adh +Feed-adh +fhear-ghlèidhidh +Fhear-ghlèidhidh +fhear-gleidhidh +Fhear-gleidhidh +fhear-glèidhidh +Fhear-glèidhidh +fhear-labhairt +Fhear-labhairt +fhear-leughaidh +Fhear-leughaidh +fhear-sa +Fhear-sa +fhear-sgrùdaidh +Fhear-sgrùdaidh +fhir-cinnidh +Fhir-cinnidh +fhéin-ìomhaigh +Fhéin-ìomhaigh +fhìor-luachmhor +Fhìor-luachmhor +fois-fhòirneirt +Fois-fhòirneirt +fàs-bheairtean +Fàs-bheairtean +féin-mhisneachd +Féin-mhisneachd +féin-mholadh +Féin-mholadh +fìor-thàbhachdach +Fìor-thàbhachdach +ge-ta +Ge-ta +ge-tà +Ge-tà +ged-tà +Ged-tà +geàrr-chunntais +Geàrr-chunntais +geàrr-chunntas +Geàrr-chunntas +geàrr-thréimhseach +Geàrr-thréimhseach +ghuth-thàmh +Ghuth-thàmh +glain'-amhairc +Glain'-amhairc +glas-ghuib +Glas-ghuib +gnàth-bhriathrachas +Gnàth-bhriathrachas +gàrradh-crìche +Gàrradh-crìche +h- +H- +h-ana-miannaibh +H-ana-miannaibh +h-uile +H-uile +hó-ró +Hó-ró +iar-mhinistear +Iar-mhinistear +inneal-spreadhaidh +Inneal-spreadhaidh +ionad-còmhnaidh +Ionad-còmhnaidh +join-adh +Join-adh +latha-an-diugh +Latha-an-diugh +leam-sa +Leam-sa +leas-adh +Leas-adh +lease-adh +Lease-adh +leat-sa +Leat-sa +leotha-san +Leotha-san +leth-char +Leth-char +leth-cheud +Leth-cheud +leth-ghàidhealtachd +Leth-ghàidhealtachd +leth-pocannan +Leth-pocannan +leth-sgeulan +Leth-sgeulan +leth-uair +Leth-uair +leughadh-ne +Leughadh-ne +lighiche-sprèidh +Lighiche-sprèidh +linn-an-òir +Linn-an-òir +litir-aonta 
+Litir-aonta +loma-làn +Loma-làn +lost-s' +Lost-s' +luchd-altram +Luchd-altram +luchd-altruim +Luchd-altruim +luchd-amhairc +Luchd-amhairc +luchd-ciùil +Luchd-ciùil +luchd-cruinneachaidh +Luchd-cruinneachaidh +luchd-dìon +Luchd-dìon +luchd-ealain +Luchd-ealain +luchd-einnseanaraidh +Luchd-einnseanaraidh +luchd-glèidhteachais +Luchd-glèidhteachais +luchd-gnìomhachais +Luchd-gnìomhachais +luchd-iomairt +Luchd-iomairt +luchd-lagh +Luchd-lagh +luchd-lagha +Luchd-lagha +luchd-leanmhainn +Luchd-leanmhainn +luchd-litreachais +Luchd-litreachais +luchd-obrach +Luchd-obrach +luchd-reic +Luchd-reic +luchd-sgrùdaidh +Luchd-sgrùdaidh +luchd-teagaisg +Luchd-teagaisg +luchd-turais +Luchd-turais +luchd-éisdeachd +Luchd-éisdeachd +luchd-éisteachd +Luchd-éisteachd +là-an-diugh +Là-an-diugh +làmh-chuideachaidh +Làmh-chuideachaidh +làmh-sgrìobhainn +Làmh-sgrìobhainn +làmh-sgrìobhainnean +Làmh-sgrìobhainnean +làmh-sgrìobhta +Làmh-sgrìobhta +làn-bheachd +Làn-bheachd +làn-ghàidhealtachd +Làn-ghàidhealtachd +làn-thuigse +Làn-thuigse +làn-ùine +Làn-ùine +làrna-mhàireach +Làrna-mhàireach +lìn-bheaga +Lìn-bheaga +lùth-chleasan +Lùth-chleasan +ma-ta +Ma-ta +ma-tha +Ma-tha +ma-thà +Ma-thà +ma-tà +Ma-tà +mac-an-duine +Mac-an-duine +mac-léinn +Mac-léinn +mac-meanmna +Mac-meanmna +maighstir-sgoile +Maighstir-sgoile +maor-chladaich +Maor-chladaich +maor-fearainn +Maor-fearainn +mar-thà +Mar-thà +marbh-riaghailt +Marbh-riaghailt +meadhan-aoiseil +Meadhan-aoiseil +meadhan-latha +Meadhan-latha +meadhan-oidhche +Meadhan-oidhche +meal-an-naidheachd +Meal-an-naidheachd +mean-fhàs +Mean-fhàs +mhac-meanmna +Mhac-meanmna +mheadhain-latha +Mheadhain-latha +mheadhain-oidhche +Mheadhain-oidhche +mheadhan-oidhche +Mheadhan-oidhche +mheantraiginn-sa +Mheantraiginn-sa +mhi-rùn +Mhi-rùn +mhic-an-duine +Mhic-an-duine +mhoraltachd-sa +Mhoraltachd-sa +mhuir-làn +Mhuir-làn +mhuir-sgèin +Mhuir-sgèin +mhàthair-san +Mhàthair-san +mhì-chinnt +Mhì-chinnt +mhì-chneasda +Mhì-chneasda +mhì-chòrdadh +Mhì-chòrdadh 
+mhì-riaraichte +Mhì-riaraichte +mhì-shocair +Mhì-shocair +mhòr-chuid +Mhòr-chuid +mhòr-shluagh +Mhòr-shluagh +mhòr-shluaigh +Mhòr-shluaigh +mhór-amharas +Mhór-amharas +mhór-chuid +Mhór-chuid +mhór-shluaigh +Mhór-shluaigh +mi-chneasda +Mi-chneasda +mi-rùn +Mi-rùn +mic-léinn +Mic-léinn +mion-chànain +Mion-chànain +mion-fhios +Mion-fhios +mion-fhiosrach +Mion-fhiosrach +mion-sgrùdadh +Mion-sgrùdadh +muir-meadhon-thireach +Muir-meadhon-thireach +mèinnean-talmhainn +Mèinnean-talmhainn +mì-chinnt +Mì-chinnt +mì-choltach +Mì-choltach +mì-dhòigh +Mì-dhòigh +mì-fhair +Mì-fhair +mì-fhortanach +Mì-fhortanach +mì-laghail +Mì-laghail +mì-nàdarra +Mì-nàdarra +mì-nàdarrach +Mì-nàdarrach +mì-rùin +Mì-rùin +mì-shealbhach +Mì-shealbhach +mì-thlachd +Mì-thlachd +mòr-shluagh +Mòr-shluagh +mór-bhuannachd +Mór-bhuannachd +mór-chuid +Mór-chuid +mór-roinn +Mór-roinn +n- +N- +neach-casaid +Neach-casaid +neach-cathrach +Neach-cathrach +neach-gairm +Neach-gairm +neo-chiontach +Neo-chiontach +neo-eisimeileach +Neo-eisimeileach +neo-iomlan +Neo-iomlan +neo-àbhaisteach +Neo-àbhaisteach +nua-bhàrdachd +Nua-bhàrdachd +nì-eigin +Nì-eigin +obair-sa +Obair-sa +oifigear-stiùiridh +Oifigear-stiùiridh +oirbh-se +Oirbh-se +ola-thruis +Ola-thruis +orm-sa +Orm-sa +orra-san +Orra-san +phiuthar-chéile +Phiuthar-chéile +phort-adhair +Phort-adhair +phump-adh +Phump-adh +phàipeir-naidheachd +Phàipeir-naidheachd +phòcaid-thòine +Phòcaid-thòine +pole-aichean +Pole-aichean +port-adhair +Port-adhair +proove-adh +Proove-adh +pàipear-naidheachd +Pàipear-naidheachd +pàipearan-naidheachd +Pàipearan-naidheachd +radio-beò +Radio-beò +rithe-se +Rithe-se +rium-sa +Rium-sa +ro-chumhang +Ro-chumhang +ro-eòlach +Ro-eòlach +ro-innleachd +Ro-innleachd +ro-làimh +Ro-làimh +ro-shealladh +Ro-shealladh +roth-thoisich +Roth-thoisich +rèidio-beò +Rèidio-beò +rùm-cùil +Rùm-cùil +sadadh-a-steach +Sadadh-a-steach +samhradh-a-chaidh +Samhradh-a-chaidh +saor-làithean +Saor-làithean +sead-fhighe +Sead-fhighe +sean-ghnàthas +Sean-ghnàthas 
+seana-bhliadhn' +Seana-bhliadhn' +seirbhis-aisig +Seirbhis-aisig +seòl-mara +Seòl-mara +seòmar-cadail +Seòmar-cadail +sgeulachdan-gaisge +Sgeulachdan-gaisge +sgoil-marcaidheachd +Sgoil-marcaidheachd +sgìr-easbaig +Sgìr-easbaig +sgìre-easbaig +Sgìre-easbaig +sheann-fhasanta +Sheann-fhasanta +shlatan-connaidh +Shlatan-connaidh +shon-sa +Shon-sa +shàr-sgoilear +Shàr-sgoilear +sibh-se +Sibh-se +snodha-gàire +Snodha-gàire +so-labhairt +So-labhairt +soch-mhalairteach +Soch-mhalairteach +spor-gunna +Spor-gunna +sàr-bheachdan +Sàr-bheachdan +sìor-dhol +Sìor-dhol +sùil-air-ais +Sùil-air-ais +sùil-mhara +Sùil-mhara +t- +T- +taigh-cuibhle +Taigh-cuibhle +taigh-céilidh +Taigh-céilidh +taigh-sa +Taigh-sa +taigh-sheinnse +Taigh-sheinnse +taigh-tasgaidh +Taigh-tasgaidh +taigh-tughaidh +Taigh-tughaidh +taigh-òsda +Taigh-òsda +taigh-òsta +Taigh-òsta +taighean-aoigheachd +Taighean-aoigheachd +taobh-sa +Taobh-sa +teachd-an-tìr +Teachd-an-tìr +teaghlach-chànanan +Teaghlach-chànanan +thaicean-airgid +Thaicean-airgid +thaighean-altraim +Thaighean-altraim +thonn-gheal +Thonn-gheal +thuigse-san +Thuigse-san +tigh-croiteir +Tigh-croiteir +tigh-còmhnaidh +Tigh-còmhnaidh +tigh-seinnse +Tigh-seinnse +tigh-sheinnse +Tigh-sheinnse +tighearnan-fearainn +Tighearnan-fearainn +togail-cridhe +Togail-cridhe +travel-adh +Travel-adh +triob-sa +Triob-sa +tro-chèile +Tro-chèile +troimh-a-chéile +Troimh-a-chéile +troimh-chèile +Troimh-chèile +troimhe-chéile +Troimhe-chéile +tuathanas-éisg +Tuathanas-éisg +tè-labhairt +Tè-labhairt +tìr-mhóir +Tìr-mhóir +tìr-mòr +Tìr-mòr +ugam-s' +Ugam-s' +ugam-sa +Ugam-sa +uige-san +Uige-san +uile-gu-lèir +Uile-gu-lèir +uile-tuigseach +Uile-tuigseach +use-agadh +Use-agadh +watch-adh +Watch-adh +weld-adh +Weld-adh +àrd-cheannard +Àrd-cheannard +àrd-chomhairliche +Àrd-chomhairliche +àrd-chonstabal +Àrd-chonstabal +àrd-dhuine +Àrd-dhuine +àrd-ionmhair +Àrd-ionmhair +àrd-oifigear +Àrd-oifigear +àrd-oifigeir +Àrd-oifigeir +àrd-sgoil +Àrd-sgoil +àrd-ìre +Àrd-ìre +àrd-ùrlair 
+Àrd-ùrlair +àrd-ùrlar +Àrd-ùrlar +às-creideach +Às-creideach +àtha-cheilpe +Àtha-cheilpe +ìre-sa +Ìre-sa +ìre-se +Ìre-se +òg-mhios +Òg-mhios +òige-sa +Òige-sa +òrd-mhòr +Òrd-mhòr""".split(): + _exc[orth] = [{ORTH: orth}] + +# Multiple words that should remain as one token +for orth in """'n diugh +'N diugh +'n dà +'N dà +'n iar +'N iar +'n seo +'N seo +'n uairsin +'N uairsin +a a sineach +A a sineach +a b' +A b' +a bhos +A bhos +a bhàn +A bhàn +a bhòn raoir +A bhòn raoir +a bhòn uiridh +A bhòn uiridh +a bhòn-dè +A bhòn-dè +a bhòn-raoir +A bhòn-raoir +a bhòn-uiridh +A bhòn-uiridh +a bu' +A bu' +a chaoidh +A chaoidh +a cheana +A cheana +a chionn +A chionn +a chionn 's +A chionn 's +a chuile +A chuile +a chèil +A chèil +a chèile +A chèile +a chéile +A chéile +a deas +A deas +a dh' +A dh' +a h-uile +A h-uile +a mach +A mach +a muigh +A muigh +a màireach +A màireach +a nall +A nall +a neisd +A neisd +a nis +A nis +a nisd +A nisd +a nise +A nise +a niste +A niste +a nochd +A nochd +a nuas +A nuas +a null +A null +a raoir +A raoir +a riamh +A riamh +a rithist +A rithist +a s +A s +a seo +A seo +a seothach +A seothach +a shineach +A shineach +a sin +A sin +a sineach +A sineach +a staidh +A staidh +a staigh +A staigh +a steach +A steach +a stigh +A stigh +a tuath +A tuath +a uiridh +A uiridh +a' diugh +A' diugh +a' s +A' s +air bith +Air bith +air choireigin +Air choireigin +air choireigin-ach +Air choireigin-ach +air choreigin +Air choreigin +air dheireadh +Air dheireadh +air falbh +Air falbh +air neo +Air neo +air thùs +Air thùs +am a màireach muigh +Am a màireach muigh +am bliadhna +Am bliadhna +am muigh +Am muigh +an am +An am +an aodann bàn +An aodann bàn +an ath bhliadhna +An ath bhliadhna +an ath oidhch' +An ath oidhch' +an ath oidhche +An ath oidhche +an ath sheachdain +An ath sheachdain +an ath sheachdainn +An ath sheachdainn +an ath-bhliadhna +An ath-bhliadhna +an ath-oidhch' +An ath-oidhch' +an ath-oidhche +An ath-oidhche +an ath-sheachdain +An ath-sheachdain 
+an ath-sheachdainn +An ath-sheachdainn +an ceart-uair +An ceart-uair +an ceartuair +An ceartuair +an còmhnaidh +An còmhnaidh +an de +An de +an deas +An deas +an diugh +An diugh +an dràsda +An dràsda +an dràsta +An dràsta +an dè +An dè +an ear +An ear +an earair +An earair +an earar +An earar +an earras +An earras +an iar +An iar +an iaras +An iaras +an làrna-mhàireach +An làrna-mhàireach +an raoir +An raoir +an sean +An sean +an seo +An seo +an seothach +An seothach +an sin +An sin +an sineach +An sineach +an siod +An siod +an siud +An siud +an siudach +An siudach +an toiseach +An toiseach +an uair +An uair +an uair sin +An uair sin +an uairsin +An uairsin +an uirigh +An uirigh +an àird +An àird +an àirde +An àirde +an ìre mhath +An ìre mhath +ana nàdarra +Ana nàdarra +ann a +Ann a +ann a sheo +Ann a sheo +ann a sheothach +Ann a sheothach +ann a shin +Ann a shin +ann a shineach +Ann a shineach +ann a shiodach +Ann a shiodach +ann a shiud +Ann a shiud +ann a shiudach +Ann a shiudach +ann a' +Ann a' +ann a' shiudach +Ann a' shiudach +ann a-seo +Ann a-seo +ann a-seothach +Ann a-seothach +ann a-sin +Ann a-sin +ann a-sineach +Ann a-sineach +ann a-siud +Ann a-siud +ann am +Ann am +ann an +Ann an +ann an seo +Ann an seo +ann an shin +Ann an shin +ann an shiud +Ann an shiud +ann an sin +Ann an sin +ann an siud +Ann an siud +ann seo +Ann seo +anns a' bhad +Anns a' bhad +anns an +Anns an +ath-oidhch' +Ath-oidhch' +ban-righ 'nn +Ban-righ 'nn +bho thoiseach +Bho thoiseach +bhon 'n +Bhon 'n +bhon a' +Bhon a' +bhon an +Bhon an +bhrist ' +Bhrist ' +buille a-mach +Buille a-mach +bun os cionn +Bun os cionn +car son +Car son +ceann a tuath +Ceann a tuath +cia mheud +Cia mheud +coille chaoil +Coille chaoil +cò mheud +Cò mheud +có dhiubh +Có dhiubh +d' rachadh +D' rachadh +dhen an +Dhen an +do n +Do n +dè mar +Dè mar +dé mar +Dé mar +eilean tiridhe +Eilean tiridhe +fa leth +Fa leth +fad as +Fad as +fo dheireadh +Fo dheireadh +fon a' +Fon a' +fon an +Fon an +gar bith +Gar bith +gar 
bith có +Gar bith có +ge 's bith +Ge 's bith +ge b' e air bith +Ge b' e air bith +ge be +Ge be +ge brith +Ge brith +ge brì +Ge brì +gleann dail +Gleann dail +gleann ois +Gleann ois +gu bè gu dè +Gu bè gu dè +gu dè +Gu dè +gu dé +Gu dé +gu ruige +Gu ruige +ho ro gheallaidh +Ho ro gheallaidh +ma dheireadh +Ma dheireadh +ma dheireadh thall +Ma dheireadh thall +ma sgaoil +Ma sgaoil +ma tha +Ma tha +mar an ceudna +Mar an ceudna +mar bu trice +Mar bu trice +mar tha +Mar tha +meadhan aoiseil +Meadhan aoiseil +mu 'n +Mu 'n +mu chuairt +Mu chuairt +mu dheas +Mu dheas +mu dheireadh +Mu dheireadh +mu dheireadh thall +Mu dheireadh thall +mu n +Mu n +mu thràth +Mu thràth +mun a' +Mun a' +mun an +Mun an +na b' +Na b' +na bu +Na bu +na iad +Na iad +nach maireann +Nach maireann +o'n uairsin +O'n uairsin +oidhch ' +Oidhch ' +on a' +On a' +on an +On an +pholl a' ghrùthain +Pholl a' ghrùthain +roinn eorpa +Roinn eorpa +ron a' +Ron a' +ron an +Ron an +ruaidh mhònaidh +Ruaidh mhònaidh +ruith thairis +Ruith thairis +sa bhad +Sa bhad +sadadh a-mach +Sadadh a-mach +sadadh a-steach +Sadadh a-steach +sam bidh +Sam bidh +sam bith +Sam bith +srath chluaidh +Srath chluaidh +taobh a-muigh +Taobh a-muigh +taobh an ear +Taobh an ear +taobh an iar +Taobh an iar +tria san ngaoidhilcc nalbanaigh +Tria san ngaoidhilcc nalbanaigh +tron a' +Tron a' +tron an +Tron an +tuilleadh 's a chòir +Tuilleadh 's a chòir +tuilleadh sa chòir +Tuilleadh sa chòir""".split("\n"): + _exc[orth] = [{ORTH: orth}] + + +TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) From acbf2a428f06b6e9b3df111c6c1447cba9b174b0 Mon Sep 17 00:00:00 2001 From: Muzaffer Cikay Date: Mon, 9 Sep 2024 12:15:40 +0300 Subject: [PATCH 16/29] Add Kurdish Kurmanji language (#13561) * Add Kurdish Kurmanji language * Add lex_attrs --- spacy/lang/kmr/__init__.py | 15 +++ spacy/lang/kmr/examples.py | 17 ++++ spacy/lang/kmr/lex_attrs.py | 139 ++++++++++++++++++++++++++ spacy/lang/kmr/stop_words.py | 44 ++++++++ spacy/tests/lang/kmr/__init__.py | 0 
spacy/tests/lang/kmr/test_text.py | 16 +++ spacy/tests/lang/test_initialize.py | 2 +- spacy/tests/tokenizer/test_explain.py | 1 + website/meta/languages.json | 6 ++ 9 files changed, 239 insertions(+), 1 deletion(-) create mode 100644 spacy/lang/kmr/__init__.py create mode 100644 spacy/lang/kmr/examples.py create mode 100644 spacy/lang/kmr/lex_attrs.py create mode 100644 spacy/lang/kmr/stop_words.py create mode 100644 spacy/tests/lang/kmr/__init__.py create mode 100644 spacy/tests/lang/kmr/test_text.py diff --git a/spacy/lang/kmr/__init__.py b/spacy/lang/kmr/__init__.py new file mode 100644 index 00000000000..379696f23e6 --- /dev/null +++ b/spacy/lang/kmr/__init__.py @@ -0,0 +1,15 @@ +from .lex_attrs import LEX_ATTRS +from ...language import BaseDefaults, Language +from .stop_words import STOP_WORDS + + +class KurmanjiDefaults(BaseDefaults): + stop_words = STOP_WORDS + lex_attr_getters = LEX_ATTRS + + +class Kurmanji(Language): + lang = "kmr" + Defaults = KurmanjiDefaults + +__all__ = ["Kurmanji"] diff --git a/spacy/lang/kmr/examples.py b/spacy/lang/kmr/examples.py new file mode 100644 index 00000000000..5eb362001bf --- /dev/null +++ b/spacy/lang/kmr/examples.py @@ -0,0 +1,17 @@ +""" +Example sentences to test spaCy and its language models. + +>>> from spacy.lang.kmr.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + +sentences = [ + "Berê mirovan her tim li geşedana pêşerojê ye", # People's gaze is always on the development of the future + "Kawa Nemir di 14 salan de Ulysses wergerand Kurmancî.", # Kawa Nemir translated Ulysses into Kurmanji in 14 years. 
+ "Mem Ararat hunermendekî Kurd yê bi nav û deng e.", # Mem Ararat is a famous Kurdish artist + "Firat Cewerî 40 sal e pirtûkên Kurdî dinivîsîne.", # Firat Ceweri has been writing Kurdish books for 40 years + "Rojnamegerê ciwan nûçeyeke balkêş li ser rewşa aborî nivîsand", # The young journalist wrote an interesting news article about the economic situation + "Sektora çandiniyê beşeke giring a belavkirina gaza serayê li seranserê cîhanê pêk tîne", # The agricultural sector constitutes an important part of greenhouse gas emissions worldwide + "Xwendekarên jêhatî di pêşbaziya matematîkê de serkeftî bûn", # Talented students succeeded in the mathematics competition + "Ji ber ji tunebûnê bavê min xwişkeke min nedan xwendin ew ji min re bû derd û kulek.", # Because of poverty, my father didn't send my sister to school, which became a pain and sorrow for me +] diff --git a/spacy/lang/kmr/lex_attrs.py b/spacy/lang/kmr/lex_attrs.py new file mode 100644 index 00000000000..8927ef1411c --- /dev/null +++ b/spacy/lang/kmr/lex_attrs.py @@ -0,0 +1,139 @@ +from ...attrs import LIKE_NUM + + +_num_words = [ + "sifir", + "yek", + "du", + "sê", + "çar", + "pênc", + "şeş", + "heft", + "heşt", + "neh", + "deh", + "yazde", + "dazde", + "sêzde", + "çarde", + "pazde", + "şazde", + "hevde", + "hejde", + "nozde", + "bîst", + "sî", + "çil", + "pêncî", + "şêst", + "heftê", + "heştê", + "nod", + "sed", + "hezar", + "milyon", + "milyar", +] + +_ordinal_words = [ + "yekem", + "yekemîn", + "duyem", + "duyemîn", + "sêyem", + "sêyemîn", + "çarem", + "çaremîn", + "pêncem", + "pêncemîn", + "şeşem", + "şeşemîn", + "heftem", + "heftemîn", + "heştem", + "heştemîn", + "nehem", + "nehemîn", + "dehem", + "dehemîn", + "yazdehem", + "yazdehemîn", + "dazdehem", + "dazdehemîn", + "sêzdehem", + "sêzdehemîn", + "çardehem", + "çardehemîn", + "pazdehem", + "pazdehemîn", + "şanzdehem", + "şanzdehemîn", + "hevdehem", + "hevdehemîn", + "hejdehem", + "hejdehemîn", + "nozdehem", + "nozdehemîn", + "bîstem", + "bîstemîn", 
+ "sîyem", + "sîyemîn", + "çilem", + "çilemîn", + "pêncîyem", + "pênciyemîn", + "şêstem", + "şêstemîn", + "heftêyem", + "heftêyemîn", + "heştêyem", + "heştêyemîn", + "notem", + "notemîn", + "sedem", + "sedemîn", + "hezarem", + "hezaremîn", + "milyonem", + "milyonemîn", + "milyarem", + "milyaremîn", +] + + +def like_num(text): + if text.startswith(("+", "-", "±", "~")): + text = text[1:] + text = text.replace(",", "").replace(".", "") + if text.isdigit(): + return True + if text.count("/") == 1: + num, denom = text.split("/") + if num.isdigit() and denom.isdigit(): + return True + text_lower = text.lower() + if text_lower in _num_words: + return True + + # Check ordinal number + if text_lower in _ordinal_words: + return True + + if is_digit(text_lower): + return True + + return False + + +def is_digit(text): + endings = ("em", "yem", "emîn", "yemîn") + for ending in endings: + to = len(ending) + if text.endswith(ending) and text[:-to].isdigit(): + return True + + return False + + +LEX_ATTRS = {LIKE_NUM: like_num} diff --git a/spacy/lang/kmr/stop_words.py b/spacy/lang/kmr/stop_words.py new file mode 100644 index 00000000000..aee33c2b748 --- /dev/null +++ b/spacy/lang/kmr/stop_words.py @@ -0,0 +1,44 @@ +STOP_WORDS = set( + """ +û +li +bi +di +da +de +ji +ku +ew +ez +tu +em +hûn +ew +ev +min +te +wî +wê +me +we +wan +vê +vî +va +çi +kî +kê +çawa +çima +kengî +li ku +çend +çiqas +her +hin +gelek +hemû +kes +tişt +""".split() +) diff --git a/spacy/tests/lang/kmr/__init__.py b/spacy/tests/lang/kmr/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/spacy/tests/lang/kmr/test_text.py b/spacy/tests/lang/kmr/test_text.py new file mode 100644 index 00000000000..209f386ec6e --- /dev/null +++ b/spacy/tests/lang/kmr/test_text.py @@ -0,0 +1,16 @@ +import pytest + +from spacy.lang.kmr.lex_attrs import like_num + + +@pytest.mark.parametrize( + "word", ["yekem", "duyemîn", "100em", "dehem", "sedemîn", "34em", "30yem", "20emîn", "50yemîn"] +) +def 
test_kmr_lex_attrs_like_number_for_ordinal(word): + assert like_num(word) + + +@pytest.mark.parametrize("word", ["deh"]) +def test_kmr_lex_attrs_capitals(word): + assert like_num(word) + assert like_num(word.upper()) diff --git a/spacy/tests/lang/test_initialize.py b/spacy/tests/lang/test_initialize.py index 8a158647a69..9b9ca4834cc 100644 --- a/spacy/tests/lang/test_initialize.py +++ b/spacy/tests/lang/test_initialize.py @@ -10,7 +10,7 @@ "hr", "hu", "hy", "id", "is", "it", "kn", "ky", "lb", "lt", "lv", "mk", "ml", "mr", "nb", "ne", "nl", "pl", "pt", "ro", "ru", "sa", "si", "sk", "sl", "sq", "sr", "sv", "ta", "te", "ti", "tl", "tn", - "tr", "tt", "uk", "ur", "xx", "yo"] + "tr", "tt", "uk", "ur", "xx", "yo", "kmr"] # fmt: on diff --git a/spacy/tests/tokenizer/test_explain.py b/spacy/tests/tokenizer/test_explain.py index 78932f6539c..d8241a81c68 100644 --- a/spacy/tests/tokenizer/test_explain.py +++ b/spacy/tests/tokenizer/test_explain.py @@ -57,6 +57,7 @@ pytest.param("tr", marks=pytest.mark.slow()), pytest.param("tt", marks=pytest.mark.slow()), pytest.param("ur", marks=pytest.mark.slow()), + pytest.param("kmr", marks=pytest.mark.slow()), ] diff --git a/website/meta/languages.json b/website/meta/languages.json index d6a07809795..a3717298fe1 100644 --- a/website/meta/languages.json +++ b/website/meta/languages.json @@ -480,6 +480,12 @@ ], "example": "这是一个用于示例的句子。", "has_examples": true + }, + { + "code": "kmr", + "name": "Kurdish Kurmanji", + "example": "Ev hevokek e", + "has_examples": true } ], "licenses": [ From 608f65ce40fdcd4bee3e67a0e94638913aa54090 Mon Sep 17 00:00:00 2001 From: ykyogoku <39462750+ykyogoku@users.noreply.github.com> Date: Mon, 9 Sep 2024 11:18:03 +0200 Subject: [PATCH 17/29] add Tibetan (#13510) --- spacy/lang/bo/__init__.py | 16 +++ spacy/lang/bo/examples.py | 16 +++ spacy/lang/bo/lex_attrs.py | 32 +++++ spacy/lang/bo/stop_words.py | 198 ++++++++++++++++++++++++++ spacy/tests/conftest.py | 3 + spacy/tests/lang/bo/__init__.py | 0 
spacy/tests/lang/bo/test_text.py | 21 +++ spacy/tests/tokenizer/test_explain.py | 1 + website/meta/languages.json | 6 + 9 files changed, 293 insertions(+) create mode 100644 spacy/lang/bo/__init__.py create mode 100644 spacy/lang/bo/examples.py create mode 100644 spacy/lang/bo/lex_attrs.py create mode 100644 spacy/lang/bo/stop_words.py create mode 100644 spacy/tests/lang/bo/__init__.py create mode 100644 spacy/tests/lang/bo/test_text.py diff --git a/spacy/lang/bo/__init__.py b/spacy/lang/bo/__init__.py new file mode 100644 index 00000000000..84ef8c0861f --- /dev/null +++ b/spacy/lang/bo/__init__.py @@ -0,0 +1,16 @@ +from ...language import BaseDefaults, Language +from .lex_attrs import LEX_ATTRS +from .stop_words import STOP_WORDS + + +class TibetanDefaults(BaseDefaults): + lex_attr_getters = LEX_ATTRS + stop_words = STOP_WORDS + + +class Tibetan(Language): + lang = "bo" + Defaults = TibetanDefaults + + +__all__ = ["Tibetan"] diff --git a/spacy/lang/bo/examples.py b/spacy/lang/bo/examples.py new file mode 100644 index 00000000000..8ed9372ec2b --- /dev/null +++ b/spacy/lang/bo/examples.py @@ -0,0 +1,16 @@ +""" +Example sentences to test spaCy and its language models. 
+ +>>> from spacy.lang.bo.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "དོན་དུ་རྒྱ་མཚོ་བླ་མ་ཞེས་བྱ་ཞིང༌།", + "ཏཱ་ལའི་ཞེས་པ་ནི་སོག་སྐད་ཡིན་པ་དེ་བོད་སྐད་དུ་རྒྱ་མཚོའི་དོན་དུ་འཇུག", + "སོག་པོ་ཨལ་ཐན་རྒྱལ་པོས་རྒྱལ་དབང་བསོད་ནམས་རྒྱ་མཚོར་ཆེ་བསྟོད་ཀྱི་མཚན་གསོལ་བ་ཞིག་ཡིན་ཞིང༌།", + "རྗེས་སུ་རྒྱལ་བ་དགེ་འདུན་གྲུབ་དང༌། དགེ་འདུན་རྒྱ་མཚོ་སོ་སོར་ཡང་ཏཱ་ལའི་བླ་མའི་སྐུ་ཕྲེང་དང་པོ་དང༌།", + "གཉིས་པའི་མཚན་དེ་གསོལ་ཞིང༌།༸རྒྱལ་དབང་སྐུ་ཕྲེང་ལྔ་པས་དགའ་ལྡན་ཕོ་བྲང་གི་སྲིད་དབང་བཙུགས་པ་ནས་ཏཱ་ལའི་བླ་མ་ནི་བོད་ཀྱི་ཆོས་སྲིད་གཉིས་ཀྱི་དབུ་ཁྲིད་དུ་གྱུར་ཞིང་།", + "ད་ལྟའི་བར་ཏཱ་ལའི་བླ་མ་སྐུ་ཕྲེང་བཅུ་བཞི་བྱོན་ཡོད།", +] diff --git a/spacy/lang/bo/lex_attrs.py b/spacy/lang/bo/lex_attrs.py new file mode 100644 index 00000000000..c6a593868cc --- /dev/null +++ b/spacy/lang/bo/lex_attrs.py @@ -0,0 +1,32 @@ +from ...attrs import LIKE_NUM + +# reference 1: https://en.wikipedia.org/wiki/Tibetan_numerals + +_num_words = [ + "ཀླད་ཀོར་", + "གཅིག་", "གཉིས་", "གསུམ་", "བཞི་", "ལྔ་", "དྲུག་", "བདུན་", "བརྒྱད་", "དགུ་", "བཅུ་", + "བཅུ་གཅིག་", "བཅུ་གཉིས་", "བཅུ་གསུམ་", "བཅུ་བཞི་", "བཅུ་ལྔ་", "བཅུ་དྲུག་", "བཅུ་བདུན་", "བཅུ་པརྒྱད", "བཅུ་དགུ་", "ཉི་ཤུ་", + "སུམ་ཅུ", "བཞི་བཅུ", "ལྔ་བཅུ", "དྲུག་ཅུ", "བདུན་ཅུ", "བརྒྱད་ཅུ", "དགུ་བཅུ", "བརྒྱ་", + "སྟོང་", "ཁྲི་", "ས་ཡ་", " བྱེ་བ་", "དུང་ཕྱུར་", "ཐེར་འབུམ་", "ཐེར་འབུམ་ཆེན་པོ་", "ཁྲག་ཁྲིག་", "ཁྲག་ཁྲིག་ཆེན་པོ་", +] + + +def like_num(text): + """ + Check if text resembles a number + """ + if text.startswith(("+", "-", "±", "~")): + text = text[1:] + text = text.replace(",", "").replace(".", "") + if text.isdigit(): + return True + if text.count("/") == 1: + num, denom = text.split("/") + if num.isdigit() and denom.isdigit(): + return True + if text in _num_words: + return True + return False + + +LEX_ATTRS = {LIKE_NUM: like_num} diff --git a/spacy/lang/bo/stop_words.py b/spacy/lang/bo/stop_words.py new file mode 100644 index 00000000000..407242c849b --- /dev/null +++ b/spacy/lang/bo/stop_words.py @@ -0,0 +1,198 @@ +# Source: 
https://zenodo.org/records/10148636 + +STOP_WORDS = set( + """ +འི་ +། +དུ་ +གིས་ +སོགས་ +ཏེ +གི་ +རྣམས་ +ནི +ཀུན་ +ཡི་ +འདི +ཀྱི་ +སྙེད་ +པས་ +གཞན་ +ཀྱིས་ +ཡི +ལ +ནི་ +དང་ +སོགས +ཅིང་ +ར +དུ +མི་ +སུ་ +བཅས་ +ཡོངས་ +ལས +ཙམ་ +གྱིས་ +དེ་ +ཡང་ +མཐའ་དག་ +ཏུ་ +ཉིད་ +ས +ཏེ་ +གྱི་ +སྤྱི +དེ +ཀ་ +ཡིན་ +ཞིང་ +འདི་ +རུང་ +རང་ +ཞིག་ +སྟེ +སྟེ་ +ན་རེ +ངམ +ཤིང་ +དག་ +ཏོ +རེ་ +འང་ +ཀྱང་ +ལགས་པ +ཚུ +དོ +ཡིན་པ +རེ +ན་རེ་ +ཨེ་ +ཚང་མ +ཐམས་ཅད་ +དམ་ +འོ་ +ཅིག་ +གྱིན་ +ཡིན +ན +ཁོ་ན་ +འམ་ +ཀྱིན་ +ལོ +ཀྱིས +བས་ +ལགས་ +ཤིག +གིས +ཀི་ +སྣ་ཚོགས་ +རྣམས +སྙེད་པ +ཡིས་ +གྱི +གི +བམ་ +ཤིག་ +རེ་རེ་ +ནམ +མིན་ +ནམ་ +ངམ་ +རུ་ +འགའ་ +ཀུན +ཤས་ +ཏུ +ཡིས +གིན་ +གམ་ +འོ +ཡིན་པ་ +མིན +ལགས +གྱིས +ཅང་ +འགའ +སམ་ +ཞིག +འང +ལས་ཆེ་ +འཕྲལ་ +བར་ +རུ +དང +ཡ +འག +སམ +ཀ +ཅུང་ཟད་ +ཅིག +ཉིད +དུ་མ +མ +ཡིན་བ +འམ +མམ +དམ +དག +ཁོ་ན +ཀྱི +ལམ +ཕྱི་ +ནང་ +ཙམ +ནོ་ +སོ་ +རམ་ +བོ་ +ཨང་ +ཕྱི +ཏོ་ +ཚོ +ལ་ལ་ +ཚོ་ +ཅིང +མ་གི་ +གེ +གོ +ཡིན་ལུགས་ +རོ་ +བོ +ལགས་པ་ +པས +རབ་ +འི +རམ +བས +གཞན +སྙེད་པ་ +འབའ་ +མཾ་ +པོ +ག་ +ག +གམ +སྤྱི་ +བམ +མོ་ +ཙམ་པ་ +ཤ་སྟག་ +མམ་ +རེ་རེ +སྙེད +ཏམ་ +ངོ +གྲང་ +ཏ་རེ +ཏམ +ཁ་ +ངེ་ +ཅོག་ +རིལ་ +ཉུང་ཤས་ +གིང་ +ཚ་ +ཀྱང +""".split() +) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 7db986ab9e7..d2bc0208157 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -80,6 +80,9 @@ def bg_tokenizer(): def bn_tokenizer(): return get_lang_class("bn")().tokenizer +@pytest.fixture(scope="session") +def bo_tokenizer(): + return get_lang_class("bo")().tokenizer @pytest.fixture(scope="session") def ca_tokenizer(): diff --git a/spacy/tests/lang/bo/__init__.py b/spacy/tests/lang/bo/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/spacy/tests/lang/bo/test_text.py b/spacy/tests/lang/bo/test_text.py new file mode 100644 index 00000000000..9275c15c4af --- /dev/null +++ b/spacy/tests/lang/bo/test_text.py @@ -0,0 +1,21 @@ +import pytest + + +@pytest.mark.parametrize( + "text,match", + [ + ("10", True), + ("1", True), + ("999.0", True), + ("གཅིག་", True), + ("གཉིས་", True), 
+ ("ཀླད་ཀོར་", True), + ("བཅུ་གཅིག་", True), + ("ཁྱི་", False), + (",", False), + ], +) +def test_lex_attrs_like_number(bo_tokenizer, text, match): + tokens = bo_tokenizer(text) + assert len(tokens) == 1 + assert tokens[0].like_num == match \ No newline at end of file diff --git a/spacy/tests/tokenizer/test_explain.py b/spacy/tests/tokenizer/test_explain.py index d8241a81c68..f4752849fdd 100644 --- a/spacy/tests/tokenizer/test_explain.py +++ b/spacy/tests/tokenizer/test_explain.py @@ -18,6 +18,7 @@ pytest.param("ar", marks=pytest.mark.slow()), pytest.param("bg", marks=pytest.mark.slow()), "bn", + pytest.param("bo", marks=pytest.mark.slow()), pytest.param("ca", marks=pytest.mark.slow()), pytest.param("cs", marks=pytest.mark.slow()), pytest.param("da", marks=pytest.mark.slow()), diff --git a/website/meta/languages.json b/website/meta/languages.json index a3717298fe1..a824b7d7c81 100644 --- a/website/meta/languages.json +++ b/website/meta/languages.json @@ -31,6 +31,12 @@ "name": "Bengali", "has_examples": true }, + { + "code": "bo", + "name": "Tibetan", + "example": "འདི་ཚིག་གྲུབ་རེད།", + "has_examples": true + }, { "code": "ca", "name": "Catalan", From 1b8d560d0e665626f9cdcb89b9d2cb7b0c4ccfcb Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 9 Sep 2024 11:19:39 +0200 Subject: [PATCH 18/29] Support 'memory zones' for user memory management (#13621) Add a context manager, nlp.memory_zone(), which will begin memory_zone() blocks on the vocab, string store, and potentially other components. Example usage: ``` with nlp.memory_zone(): for doc in nlp.pipe(texts): do_something(doc) # do_something(doc) <-- Invalid ``` Once the memory_zone() block expires, spaCy will free any shared resources that were allocated for the text-processing that occurred within the memory_zone. If you create Doc objects within a memory zone, it's invalid to access them once the memory zone is expired.
The purpose of this is that spaCy creates and stores Lexeme objects in the Vocab that can be shared between multiple Doc objects. It also interns strings. Normally, spaCy can't know when all Doc objects using a Lexeme are out-of-scope, so new Lexemes accumulate in the vocab, causing memory pressure. Memory zones solve this problem by telling spaCy "okay none of the documents allocated within this block will be accessed again". This lets spaCy free all new Lexeme objects and other data that were created during the block. The mechanism is general, so memory_zone() context managers can be added to other components that could benefit from them, e.g. pipeline components. I experimented with adding memory zone support to the tokenizer as well, for its cache. However, this seems unnecessarily complicated. It makes more sense to just stick a limit on the cache size. This lets spaCy benefit from the efficiency advantage of the cache better, because we can maintain a (bounded) cache even if only small batches of documents are being processed. 
--- spacy/strings.pxd | 7 +- spacy/strings.pyx | 141 +++++++++++++++--- spacy/tests/vocab_vectors/test_memory_zone.py | 36 +++++ spacy/tokenizer.pxd | 4 +- spacy/tokenizer.pyx | 12 +- spacy/vocab.pxd | 4 +- spacy/vocab.pyi | 4 + spacy/vocab.pyx | 73 ++++++--- 8 files changed, 232 insertions(+), 49 deletions(-) create mode 100644 spacy/tests/vocab_vectors/test_memory_zone.py diff --git a/spacy/strings.pxd b/spacy/strings.pxd index d22f48ba133..bd5e0f13562 100644 --- a/spacy/strings.pxd +++ b/spacy/strings.pxd @@ -25,5 +25,8 @@ cdef class StringStore: cdef vector[hash_t] keys cdef public PreshMap _map - cdef const Utf8Str* intern_unicode(self, str py_string) - cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash) + cdef const Utf8Str* intern_unicode(self, str py_string, bint allow_transient) + cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash, bint allow_transient) + cdef vector[hash_t] _transient_keys + cdef PreshMap _transient_map + cdef Pool _non_temp_mem diff --git a/spacy/strings.pyx b/spacy/strings.pyx index 376a131751e..5e0bd90c6d8 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -1,6 +1,10 @@ # cython: infer_types=True # cython: profile=False cimport cython + +from contextlib import contextmanager +from typing import Iterator, List, Optional + from libc.stdint cimport uint32_t from libc.string cimport memcpy from murmurhash.mrmr cimport hash32, hash64 @@ -31,7 +35,7 @@ def get_string_id(key): This function optimises for convenience over performance, so shouldn't be used in tight loops. """ - cdef hash_t str_hash + cdef hash_t str_hash if isinstance(key, str): if len(key) == 0: return 0 @@ -45,8 +49,8 @@ def get_string_id(key): elif _try_coerce_to_hash(key, &str_hash): # Coerce the integral key to the expected primitive hash type. 
# This ensures that custom/overloaded "primitive" data types - # such as those implemented by numpy are not inadvertently used - # downsteam (as these are internally implemented as custom PyObjects + # such as those implemented by numpy are not inadvertently used + # downsteam (as these are internally implemented as custom PyObjects # whose comparison operators can incur a significant overhead). return str_hash else: @@ -119,7 +123,9 @@ cdef class StringStore: strings (iterable): A sequence of unicode strings to add to the store. """ self.mem = Pool() + self._non_temp_mem = self.mem self._map = PreshMap() + self._transient_map = None if strings is not None: for string in strings: self.add(string) @@ -152,10 +158,13 @@ cdef class StringStore: return SYMBOLS_BY_INT[str_hash] else: utf8str = self._map.get(str_hash) + if utf8str is NULL and self._transient_map is not None: + utf8str = self._transient_map.get(str_hash) else: # TODO: Raise an error instead utf8str = self._map.get(string_or_id) - + if utf8str is NULL and self._transient_map is not None: + utf8str = self._transient_map.get(str_hash) if utf8str is NULL: raise KeyError(Errors.E018.format(hash_value=string_or_id)) else: @@ -175,10 +184,46 @@ cdef class StringStore: else: return self[key] - def add(self, string): + def __reduce__(self): + strings = list(self.non_transient_keys()) + return (StringStore, (strings,), None, None, None) + + def __len__(self) -> int: + """The number of strings in the store. + + RETURNS (int): The number of strings in the store. + """ + return self._keys.size() + self._transient_keys.size() + + @contextmanager + def memory_zone(self, mem: Optional[Pool] = None) -> Pool: + """Begin a block where all resources allocated during the block will + be freed at the end of it. If a resources was created within the + memory zone block, accessing it outside the block is invalid. + Behaviour of this invalid access is undefined. Memory zones should + not be nested. 
+ + The memory zone is helpful for services that need to process large + volumes of text with a defined memory budget. + """ + if mem is None: + mem = Pool() + self.mem = mem + self._transient_map = PreshMap() + yield mem + self.mem = self._non_temp_mem + self._transient_map = None + self._transient_keys.clear() + + def add(self, string: str, allow_transient: bool = False) -> int: """Add a string to the StringStore. string (str): The string to add. + allow_transient (bool): Allow the string to be stored in the 'transient' + map, which will be flushed at the end of the memory zone. Strings + encountered during arbitrary text processing should be added + with allow_transient=True, while labels and other strings used + internally should not. RETURNS (uint64): The string's hash value. """ cdef hash_t str_hash @@ -188,22 +233,26 @@ cdef class StringStore: string = string.encode("utf8") str_hash = hash_utf8(string, len(string)) - self._intern_utf8(string, len(string), &str_hash) + self._intern_utf8(string, len(string), &str_hash, allow_transient) elif isinstance(string, bytes): if string in SYMBOLS_BY_STR: return SYMBOLS_BY_STR[string] str_hash = hash_utf8(string, len(string)) - self._intern_utf8(string, len(string), &str_hash) + self._intern_utf8(string, len(string), &str_hash, allow_transient) else: raise TypeError(Errors.E017.format(value_type=type(string))) return str_hash def __len__(self): """The number of strings in the store. + if string in SYMBOLS_BY_STR: + return SYMBOLS_BY_STR[string] + else: + return self._intern_str(string, allow_transient) RETURNS (int): The number of strings in the store. """ - return self.keys.size() + return self.keys.size() + self._transient_keys.size() def __contains__(self, string_or_id not None): """Check whether a string or ID is in the store. 
@@ -222,30 +271,70 @@ cdef class StringStore: pass else: # TODO: Raise an error instead - return self._map.get(string_or_id) is not NULL - + if self._map.get(string_or_id) is not NULL: + return True + elif self._transient_map is not None and self._transient_map.get(string_or_id) is not NULL: + return True + else: + return False if str_hash < len(SYMBOLS_BY_INT): return True else: - return self._map.get(str_hash) is not NULL + if self._map.get(str_hash) is not NULL: + return True + elif self._transient_map is not None and self._transient_map.get(string_or_id) is not NULL: + return True + else: + return False def __iter__(self): """Iterate over the strings in the store, in order. YIELDS (str): A string in the store. """ + yield from self.non_transient_keys() + yield from self.transient_keys() + + def non_transient_keys(self) -> Iterator[str]: + """Iterate over the stored strings in insertion order. + + RETURNS: A list of strings. + """ cdef int i cdef hash_t key for i in range(self.keys.size()): key = self.keys[i] utf8str = self._map.get(key) yield decode_Utf8Str(utf8str) - # TODO: Iterate OOV here? def __reduce__(self): strings = list(self) return (StringStore, (strings,), None, None, None) + def transient_keys(self) -> Iterator[str]: + if self._transient_map is None: + return [] + for i in range(self._transient_keys.size()): + utf8str = self._transient_map.get(self._transient_keys[i]) + yield decode_Utf8Str(utf8str) + + def values(self) -> List[int]: + """Iterate over the stored strings hashes in insertion order. + + RETURNS: A list of string hashs. 
+ """ + cdef int i + hashes = [None] * self._keys.size() + for i in range(self._keys.size()): + hashes[i] = self._keys[i] + if self._transient_map is not None: + transient_hashes = [None] * self._transient_keys.size() + for i in range(self._transient_keys.size()): + transient_hashes[i] = self._transient_keys[i] + else: + transient_hashes = [] + return hashes + transient_hashes + def to_disk(self, path): """Save the current state to a directory. @@ -269,7 +358,7 @@ cdef class StringStore: prev = list(self) self._reset_and_load(strings) for word in prev: - self.add(word) + self.add(word, allow_transient=False) return self def to_bytes(self, **kwargs): @@ -289,7 +378,7 @@ cdef class StringStore: prev = list(self) self._reset_and_load(strings) for word in prev: - self.add(word) + self.add(word, allow_transient=False) return self def _reset_and_load(self, strings): @@ -297,22 +386,34 @@ cdef class StringStore: self._map = PreshMap() self.keys.clear() for string in strings: - self.add(string) + self.add(string, allow_transient=False) - cdef const Utf8Str* intern_unicode(self, str py_string): + cdef const Utf8Str* intern_unicode(self, str py_string, bint allow_transient): # 0 means missing, but we don't bother offsetting the index. cdef bytes byte_string = py_string.encode("utf8") - return self._intern_utf8(byte_string, len(byte_string), NULL) + return self._intern_utf8(byte_string, len(byte_string), NULL, allow_transient) @cython.final - cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash): + cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash, bint allow_transient): # TODO: This function's API/behaviour is an unholy mess... # 0 means missing, but we don't bother offsetting the index. 
cdef hash_t key = precalculated_hash[0] if precalculated_hash is not NULL else hash_utf8(utf8_string, length) cdef Utf8Str* value = self._map.get(key) if value is not NULL: return value + if allow_transient and self._transient_map is not None: + # If we've already allocated a transient string, and now we + # want to intern it permanently, we'll end up with the string + # in both places. That seems fine -- I don't see why we need + # to remove it from the transient map. + value = self._transient_map.get(key) + if value is not NULL: + return value value = _allocate(self.mem, utf8_string, length) - self._map.set(key, value) - self.keys.push_back(key) + if allow_transient and self._transient_map is not None: + self._transient_map.set(key, value) + self._transient_keys.push_back(key) + else: + self._map.set(key, value) + self.keys.push_back(key) return value diff --git a/spacy/tests/vocab_vectors/test_memory_zone.py b/spacy/tests/vocab_vectors/test_memory_zone.py new file mode 100644 index 00000000000..910d2664eb4 --- /dev/null +++ b/spacy/tests/vocab_vectors/test_memory_zone.py @@ -0,0 +1,36 @@ +from spacy.vocab import Vocab + + +def test_memory_zone_no_insertion(): + vocab = Vocab() + with vocab.memory_zone(): + pass + lex = vocab["horse"] + assert lex.text == "horse" + + +def test_memory_zone_insertion(): + vocab = Vocab() + _ = vocab["dog"] + assert "dog" in vocab + assert "horse" not in vocab + with vocab.memory_zone(): + lex = vocab["horse"] + assert lex.text == "horse" + assert "dog" in vocab + assert "horse" not in vocab + + +def test_memory_zone_redundant_insertion(): + """Test that if we insert an already-existing word while + in the memory zone, it stays persistent""" + vocab = Vocab() + _ = vocab["dog"] + assert "dog" in vocab + assert "horse" not in vocab + with vocab.memory_zone(): + lex = vocab["horse"] + assert lex.text == "horse" + _ = vocab["dog"] + assert "dog" in vocab + assert "horse" not in vocab diff --git a/spacy/tokenizer.pxd 
b/spacy/tokenizer.pxd index a902ebad941..88e4b06b024 100644 --- a/spacy/tokenizer.pxd +++ b/spacy/tokenizer.pxd @@ -25,9 +25,7 @@ cdef class Tokenizer: cdef PhraseMatcher _special_matcher # TODO convert to bool in v4 cdef int _faster_heuristics - # TODO next one is unused and should be removed in v4 - # https://github.com/explosion/spaCy/pull/9150 - cdef int _unused_int2 + cdef public int max_cache_size cdef Doc _tokenize_affixes(self, str string, bint with_special_cases) cdef int _apply_special_cases(self, Doc doc) except -1 diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 96545828fde..93b7f63acda 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -30,7 +30,7 @@ cdef class Tokenizer: """ def __init__(self, Vocab vocab, rules=None, prefix_search=None, suffix_search=None, infix_finditer=None, token_match=None, - url_match=None, faster_heuristics=True): + url_match=None, faster_heuristics=True, max_cache_size=10000): """Create a `Tokenizer`, to create `Doc` objects given unicode text. vocab (Vocab): A storage container for lexical types. @@ -50,6 +50,7 @@ cdef class Tokenizer: faster_heuristics (bool): Whether to restrict the final Matcher-based pass for rules to those containing affixes or space. Defaults to True. + max_cache_size (int): Maximum number of tokenization chunks to cache. 
EXAMPLE: >>> tokenizer = Tokenizer(nlp.vocab) @@ -69,6 +70,7 @@ cdef class Tokenizer: self._rules = {} self._special_matcher = PhraseMatcher(self.vocab) self._load_special_cases(rules) + self.max_cache_size = max_cache_size @property def token_match(self): @@ -397,8 +399,9 @@ cdef class Tokenizer: has_special, with_special_cases) self._attach_tokens(tokens, span, &prefixes, &suffixes, has_special, with_special_cases) - self._save_cached(&tokens.c[orig_size], orig_key, has_special, - tokens.length - orig_size) + if len(self._cache) < self.max_cache_size: + self._save_cached(&tokens.c[orig_size], orig_key, has_special, + tokens.length - orig_size) cdef str _split_affixes( self, @@ -514,6 +517,9 @@ cdef class Tokenizer: if n <= 0: # avoid mem alloc of zero length return 0 + # Historically this check was mostly used to avoid caching + # chunks that had tokens owned by the Doc. Now that that's + # not a thing, I don't think we need this? for i in range(n): if self.vocab._by_orth.get(tokens[i].lex.orth) == NULL: return 0 diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd index 43e47af1dee..c2bfe12e37b 100644 --- a/spacy/vocab.pxd +++ b/spacy/vocab.pxd @@ -41,7 +41,9 @@ cdef class Vocab: cdef const TokenC* make_fused_token(self, substrings) except NULL cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL - cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1 + cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex, bint is_transient) except -1 cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL cdef PreshMap _by_orth + cdef Pool _non_temp_mem + cdef vector[attr_t] _transient_orths diff --git a/spacy/vocab.pyi b/spacy/vocab.pyi index b7ff20348a0..ee7636f02c8 100644 --- a/spacy/vocab.pyi +++ b/spacy/vocab.pyi @@ -1,6 +1,8 @@ +from contextlib import contextmanager from pathlib import Path from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Union +from cymem.cymem import Pool from 
thinc.types import Floats1d, FloatsXd from . import Language @@ -67,6 +69,8 @@ class Vocab: def from_bytes( self, bytes_data: bytes, *, exclude: Iterable[str] = ... ) -> Vocab: ... + @contextmanager + def memory_zone(self, mem: Optional[Pool] = None) -> Iterator[Pool]: ... def pickle_vocab(vocab: Vocab) -> Any: ... def unpickle_vocab( diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 19e6eb005c0..97ba5d68c31 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -1,4 +1,6 @@ import functools +from contextlib import ExitStack, contextmanager +from typing import Iterator, Optional import numpy import srsly @@ -87,6 +89,12 @@ cdef class Vocab: self.lookups = lookups self.writing_system = writing_system self.get_noun_chunks = get_noun_chunks + # During a memory_zone we replace our mem object with one + # that's passed to us. We keep a reference to our non-temporary + # memory here, in case we need to make an allocation we want to + # guarantee is not temporary. This is also how we check whether + # we're in a memory zone: we check whether self.mem is self._non_temp_mem + self._non_temp_mem = self.mem @property def vectors(self): @@ -114,6 +122,33 @@ cdef class Vocab: """ return self.length + @contextmanager + def memory_zone(self, mem: Optional[Pool] = None) -> Iterator[Pool]: + """Begin a block where resources allocated during the block will + be freed at the end of it. If a resources was created within the + memory zone block, accessing it outside the block is invalid. + Behaviour of this invalid access is undefined. Memory zones should + not be nested. + + The memory zone is helpful for services that need to process large + volumes of text with a defined memory budget. + """ + if mem is None: + mem = Pool() + # The ExitStack allows programmatic nested context managers. + # We don't know how many we need, so it would be awkward to have + # them as nested blocks. 
+ with ExitStack() as stack: + contexts = [stack.enter_context(self.strings.memory_zone(mem))] + if hasattr(self.morphology, "memory_zone"): + contexts.append(stack.enter_context(self.morphology.memory_zone(mem))) + if hasattr(self._vectors, "memory_zone"): + contexts.append(stack.enter_context(self._vectors.memory_zone(mem))) + self.mem = mem + yield mem + self._clear_transient_orths() + self.mem = self._non_temp_mem + def add_flag(self, flag_getter, int flag_id=-1): """Set a new boolean flag to words in the vocabulary. @@ -148,8 +183,7 @@ cdef class Vocab: cdef const LexemeC* get(self, Pool mem, str string) except NULL: """Get a pointer to a `LexemeC` from the lexicon, creating a new - `Lexeme` if necessary using memory acquired from the given pool. If the - pool is the lexicon's own memory, the lexeme is saved in the lexicon. + `Lexeme` if necessary. """ if string == "": return &EMPTY_LEXEME @@ -180,17 +214,9 @@ cdef class Vocab: return self._new_lexeme(mem, self.strings[orth]) cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL: - # I think this heuristic is bad, and the Vocab should always - # own the lexemes. It avoids weird bugs this way, as it's how the thing - # was originally supposed to work. The best solution to the growing - # memory use is to periodically reset the vocab, which is an action - # that should be up to the user to do (so we don't need to keep track - # of the doc ownership). - # TODO: Change the C API so that the mem isn't passed in here. + # The mem argument is deprecated, replaced by memory zones. Same with + # this size heuristic. 
mem = self.mem - # if len(string) < 3 or self.length < 10000: - # mem = self.mem - cdef bint is_oov = mem is not self.mem lex = mem.alloc(1, sizeof(LexemeC)) lex.orth = self.strings.add(string) lex.length = len(string) @@ -202,18 +228,25 @@ cdef class Vocab: for attr, func in self.lex_attr_getters.items(): value = func(string) if isinstance(value, str): - value = self.strings.add(value) + value = self.strings.add(value, allow_transient=True) if value is not None: Lexeme.set_struct_attr(lex, attr, value) - if not is_oov: - self._add_lex_to_vocab(lex.orth, lex) + self._add_lex_to_vocab(lex.orth, lex, self.mem is not self._non_temp_mem) if lex == NULL: raise ValueError(Errors.E085.format(string=string)) return lex - cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1: + cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex, bint is_transient) except -1: self._by_orth.set(lex.orth, lex) self.length += 1 + if is_transient: + self._transient_orths.push_back(lex.orth) + + def _clear_transient_orths(self): + """Remove transient lexemes from the index (generally at the end of the memory zone)""" + for orth in self._transient_orths: + self._by_orth.pop(orth) + self._transient_orths.clear() def __contains__(self, key): """Check whether the string or int key has an entry in the vocabulary. 
@@ -265,7 +298,7 @@ cdef class Vocab: """ cdef attr_t orth if isinstance(id_or_string, str): - orth = self.strings.add(id_or_string) + orth = self.strings.add(id_or_string, allow_transient=True) else: orth = id_or_string return Lexeme(self, orth) @@ -417,7 +450,7 @@ cdef class Vocab: DOCS: https://spacy.io/api/vocab#get_vector """ if isinstance(orth, str): - orth = self.strings.add(orth) + orth = self.strings.add(orth, allow_transient=True) cdef Lexeme lex = self[orth] key = Lexeme.get_struct_attr(lex.c, self.vectors.attr) if self.has_vector(key): @@ -436,7 +469,7 @@ cdef class Vocab: DOCS: https://spacy.io/api/vocab#set_vector """ if isinstance(orth, str): - orth = self.strings.add(orth) + orth = self.strings.add(orth, allow_transient=False) cdef Lexeme lex = self[orth] key = Lexeme.get_struct_attr(lex.c, self.vectors.attr) if self.vectors.is_full and key not in self.vectors: @@ -460,7 +493,7 @@ cdef class Vocab: DOCS: https://spacy.io/api/vocab#has_vector """ if isinstance(orth, str): - orth = self.strings.add(orth) + orth = self.strings.add(orth, allow_transient=True) cdef Lexeme lex = self[orth] key = Lexeme.get_struct_attr(lex.c, self.vectors.attr) return key in self.vectors From b65491b6412aeb635135c637be3a59cb03dc2913 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 9 Sep 2024 11:20:23 +0200 Subject: [PATCH 19/29] Set version to v3.8.0.dev0 --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 23ef181ebca..942a731948b 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,5 +1,5 @@ # fmt: off __title__ = "spacy" -__version__ = "3.7.6" +__version__ = "3.8.0.dev0" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From 59ac7e6bdb32b34c58e94a65b8f93fe63d4290e7 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 9 Sep 2024 11:22:52 +0200 
Subject: [PATCH 20/29] Format --- spacy/lang/bo/lex_attrs.py | 41 ++++++++++++++++++++++++--- spacy/lang/gd/stop_words.py | 4 ++- spacy/lang/gd/tokenizer_exceptions.py | 23 ++++++++------- spacy/lang/kmr/__init__.py | 1 + spacy/tests/conftest.py | 2 ++ spacy/tests/lang/bo/test_text.py | 2 +- spacy/tests/lang/kmr/test_text.py | 13 ++++++++- 7 files changed, 68 insertions(+), 18 deletions(-) diff --git a/spacy/lang/bo/lex_attrs.py b/spacy/lang/bo/lex_attrs.py index c6a593868cc..5535934af1c 100644 --- a/spacy/lang/bo/lex_attrs.py +++ b/spacy/lang/bo/lex_attrs.py @@ -4,10 +4,43 @@ _num_words = [ "ཀླད་ཀོར་", - "གཅིག་", "གཉིས་", "གསུམ་", "བཞི་", "ལྔ་", "དྲུག་", "བདུན་", "བརྒྱད་", "དགུ་", "བཅུ་", - "བཅུ་གཅིག་", "བཅུ་གཉིས་", "བཅུ་གསུམ་", "བཅུ་བཞི་", "བཅུ་ལྔ་", "བཅུ་དྲུག་", "བཅུ་བདུན་", "བཅུ་པརྒྱད", "བཅུ་དགུ་", "ཉི་ཤུ་", - "སུམ་ཅུ", "བཞི་བཅུ", "ལྔ་བཅུ", "དྲུག་ཅུ", "བདུན་ཅུ", "བརྒྱད་ཅུ", "དགུ་བཅུ", "བརྒྱ་", - "སྟོང་", "ཁྲི་", "ས་ཡ་", " བྱེ་བ་", "དུང་ཕྱུར་", "ཐེར་འབུམ་", "ཐེར་འབུམ་ཆེན་པོ་", "ཁྲག་ཁྲིག་", "ཁྲག་ཁྲིག་ཆེན་པོ་", + "གཅིག་", + "གཉིས་", + "གསུམ་", + "བཞི་", + "ལྔ་", + "དྲུག་", + "བདུན་", + "བརྒྱད་", + "དགུ་", + "བཅུ་", + "བཅུ་གཅིག་", + "བཅུ་གཉིས་", + "བཅུ་གསུམ་", + "བཅུ་བཞི་", + "བཅུ་ལྔ་", + "བཅུ་དྲུག་", + "བཅུ་བདུན་", + "བཅུ་པརྒྱད", + "བཅུ་དགུ་", + "ཉི་ཤུ་", + "སུམ་ཅུ", + "བཞི་བཅུ", + "ལྔ་བཅུ", + "དྲུག་ཅུ", + "བདུན་ཅུ", + "བརྒྱད་ཅུ", + "དགུ་བཅུ", + "བརྒྱ་", + "སྟོང་", + "ཁྲི་", + "ས་ཡ་", + " བྱེ་བ་", + "དུང་ཕྱུར་", + "ཐེར་འབུམ་", + "ཐེར་འབུམ་ཆེན་པོ་", + "ཁྲག་ཁྲིག་", + "ཁྲག་ཁྲིག་ཆེན་པོ་", ] diff --git a/spacy/lang/gd/stop_words.py b/spacy/lang/gd/stop_words.py index d5132c35e31..9f5a66cbc24 100644 --- a/spacy/lang/gd/stop_words.py +++ b/spacy/lang/gd/stop_words.py @@ -382,5 +382,7 @@ ì ò ó -""".split("\n") +""".split( + "\n" + ) ) diff --git a/spacy/lang/gd/tokenizer_exceptions.py b/spacy/lang/gd/tokenizer_exceptions.py index bf47bd85950..76e169d904d 100644 --- a/spacy/lang/gd/tokenizer_exceptions.py +++ b/spacy/lang/gd/tokenizer_exceptions.py @@ -18,19 +18,18 @@ 
"càil": [{ORTH: "cà", NORM: "càite"}, {ORTH: "il", NORM: "bheil"}], "sna": [{ORTH: "s", NORM: "anns"}, {ORTH: "na", NORM: "na"}], "orra": [{ORTH: "orr", NORM: "air"}, {ORTH: "a", NORM: "do"}], - "fiùs": [{ORTH: "fiù", NORM: "fiù"}, {ORTH: "s", NORM: "'s"}] + "fiùs": [{ORTH: "fiù", NORM: "fiù"}, {ORTH: "s", NORM: "'s"}], } - + # Hyphenations that are alternative forms of words for exc_data in [ - {ORTH: "fa-near",NORM: "fainear"}, - {ORTH: "Fa-near",NORM: "Fainear"}, + {ORTH: "fa-near", NORM: "fainear"}, + {ORTH: "Fa-near", NORM: "Fainear"}, ]: _exc[exc_data[ORTH]] = [exc_data] - - - + + # Abreviations and shortened words for exc_data in [ {ORTH: "'", NORM: "a"}, @@ -1529,7 +1528,7 @@ Òige-sa òrd-mhòr Òrd-mhòr""".split(): - _exc[orth] = [{ORTH: orth}] + _exc[orth] = [{ORTH: orth}] # Multiple words that should remain as one token for orth in """'n diugh @@ -1975,8 +1974,10 @@ tuilleadh 's a chòir Tuilleadh 's a chòir tuilleadh sa chòir -Tuilleadh sa chòir""".split("\n"): - _exc[orth] = [{ORTH: orth}] - +Tuilleadh sa chòir""".split( + "\n" +): + _exc[orth] = [{ORTH: orth}] + TOKENIZER_EXCEPTIONS = update_exc(BASE_EXCEPTIONS, _exc) diff --git a/spacy/lang/kmr/__init__.py b/spacy/lang/kmr/__init__.py index 379696f23e6..124321a8ea0 100644 --- a/spacy/lang/kmr/__init__.py +++ b/spacy/lang/kmr/__init__.py @@ -12,4 +12,5 @@ class Kurmanji(Language): lang = "kmr" Defaults = KurmanjiDefaults + __all__ = ["Kurmanji"] diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index d2bc0208157..e30300a33e6 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -80,10 +80,12 @@ def bg_tokenizer(): def bn_tokenizer(): return get_lang_class("bn")().tokenizer + @pytest.fixture(scope="session") def bo_tokenizer(): return get_lang_class("bo")().tokenizer + @pytest.fixture(scope="session") def ca_tokenizer(): return get_lang_class("ca")().tokenizer diff --git a/spacy/tests/lang/bo/test_text.py b/spacy/tests/lang/bo/test_text.py index 9275c15c4af..fb3900d51c6 100644 
--- a/spacy/tests/lang/bo/test_text.py +++ b/spacy/tests/lang/bo/test_text.py @@ -18,4 +18,4 @@ def test_lex_attrs_like_number(bo_tokenizer, text, match): tokens = bo_tokenizer(text) assert len(tokens) == 1 - assert tokens[0].like_num == match \ No newline at end of file + assert tokens[0].like_num == match diff --git a/spacy/tests/lang/kmr/test_text.py b/spacy/tests/lang/kmr/test_text.py index 209f386ec6e..405dc28f62f 100644 --- a/spacy/tests/lang/kmr/test_text.py +++ b/spacy/tests/lang/kmr/test_text.py @@ -4,7 +4,18 @@ @pytest.mark.parametrize( - "word", ["yekem", "duyemîn", "100em", "dehem", "sedemîn", "34em", "30yem", "20emîn", "50yemîn"] + "word", + [ + "yekem", + "duyemîn", + "100em", + "dehem", + "sedemîn", + "34em", + "30yem", + "20emîn", + "50yemîn", + ], ) def test_kmr_lex_attrs_like_number_for_ordinal(word): assert like_num(word) From a019315534559daccc4391ed5951f10cb715e1ac Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 9 Sep 2024 13:49:41 +0200 Subject: [PATCH 21/29] Fix memory zones --- spacy/lang/kmr/__init__.py | 2 +- spacy/lang/kmr/lex_attrs.py | 1 - spacy/language.py | 35 +++++++- .../pipeline/_parser_internals/arc_eager.pyx | 2 +- spacy/pipeline/_parser_internals/nonproj.pyx | 2 +- spacy/strings.pxd | 1 - spacy/strings.pyx | 83 +++++++------------ spacy/tokenizer.pyx | 8 +- spacy/vocab.pyx | 13 ++- 9 files changed, 76 insertions(+), 71 deletions(-) diff --git a/spacy/lang/kmr/__init__.py b/spacy/lang/kmr/__init__.py index 124321a8ea0..eee9e69d0dc 100644 --- a/spacy/lang/kmr/__init__.py +++ b/spacy/lang/kmr/__init__.py @@ -1,5 +1,5 @@ -from .lex_attrs import LEX_ATTRS from ...language import BaseDefaults, Language +from .lex_attrs import LEX_ATTRS from .stop_words import STOP_WORDS diff --git a/spacy/lang/kmr/lex_attrs.py b/spacy/lang/kmr/lex_attrs.py index 8927ef1411c..6b80204104d 100644 --- a/spacy/lang/kmr/lex_attrs.py +++ b/spacy/lang/kmr/lex_attrs.py @@ -1,6 +1,5 @@ from ...attrs import LIKE_NUM - _num_words = [ "sifir", "yek", 
diff --git a/spacy/language.py b/spacy/language.py index 18d20c93932..57b8514815a 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -5,7 +5,7 @@ import random import traceback import warnings -from contextlib import contextmanager +from contextlib import ExitStack, contextmanager from copy import deepcopy from dataclasses import dataclass from itertools import chain, cycle @@ -31,6 +31,7 @@ ) import srsly +from cymem.cymem import Pool from thinc.api import Config, CupyOps, Optimizer, get_current_ops from . import about, ty, util @@ -2091,6 +2092,38 @@ def replace_listeners( util.replace_model_node(pipe.model, listener, new_model) # type: ignore[attr-defined] tok2vec.remove_listener(listener, pipe_name) + @contextmanager + def memory_zone(self, mem: Optional[Pool]=None) -> Iterator[Pool]: + """Begin a block where all resources allocated during the block will + be freed at the end of it. If a resources was created within the + memory zone block, accessing it outside the block is invalid. + Behaviour of this invalid access is undefined. Memory zones should + not be nested. + + The memory zone is helpful for services that need to process large + volumes of text with a defined memory budget. + + Example + ------- + >>> with nlp.memory_zone(): + ... for doc in nlp.pipe(texts): + ... process_my_doc(doc) + >>> # use_doc(doc) <-- Invalid: doc was allocated in the memory zone + """ + if mem is None: + mem = Pool() + # The ExitStack allows programmatic nested context managers. + # We don't know how many we need, so it would be awkward to have + # them as nested blocks. 
+ with ExitStack() as stack: + contexts = [stack.enter_context(self.vocab.memory_zone(mem))] + if hasattr(self.tokenizer, "memory_zone"): + contexts.append(stack.enter_context(self.tokenizer.memory_zone(mem))) + for _, pipe in self.pipeline: + if hasattr(pipe, "memory_zone"): + contexts.append(stack.enter_context(pipe.memory_zone(mem))) + yield mem + def to_disk( self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList() ) -> None: diff --git a/spacy/pipeline/_parser_internals/arc_eager.pyx b/spacy/pipeline/_parser_internals/arc_eager.pyx index e1375494482..bedaaf9febd 100644 --- a/spacy/pipeline/_parser_internals/arc_eager.pyx +++ b/spacy/pipeline/_parser_internals/arc_eager.pyx @@ -203,7 +203,7 @@ cdef class ArcEagerGold: def __init__(self, ArcEager moves, StateClass stcls, Example example): self.mem = Pool() heads, labels = example.get_aligned_parse(projectivize=True) - labels = [example.x.vocab.strings.add(label) if label is not None else MISSING_DEP for label in labels] + labels = [example.x.vocab.strings.add(label, allow_transient=False) if label is not None else MISSING_DEP for label in labels] sent_starts = _get_aligned_sent_starts(example) assert len(heads) == len(labels) == len(sent_starts), (len(heads), len(labels), len(sent_starts)) self.c = create_gold_state(self.mem, stcls.c, heads, labels, sent_starts) diff --git a/spacy/pipeline/_parser_internals/nonproj.pyx b/spacy/pipeline/_parser_internals/nonproj.pyx index 7de19851e00..9e3a21b814f 100644 --- a/spacy/pipeline/_parser_internals/nonproj.pyx +++ b/spacy/pipeline/_parser_internals/nonproj.pyx @@ -183,7 +183,7 @@ cpdef deprojectivize(Doc doc): new_label, head_label = label.split(DELIMITER) new_head = _find_new_head(doc[i], head_label) doc.c[i].head = new_head.i - i - doc.c[i].dep = doc.vocab.strings.add(new_label) + doc.c[i].dep = doc.vocab.strings.add(new_label, allow_transient=False) set_children_from_heads(doc.c, 0, doc.length) return doc diff --git a/spacy/strings.pxd 
b/spacy/strings.pxd index bd5e0f13562..b015858581d 100644 --- a/spacy/strings.pxd +++ b/spacy/strings.pxd @@ -28,5 +28,4 @@ cdef class StringStore: cdef const Utf8Str* intern_unicode(self, str py_string, bint allow_transient) cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash, bint allow_transient) cdef vector[hash_t] _transient_keys - cdef PreshMap _transient_map cdef Pool _non_temp_mem diff --git a/spacy/strings.pyx b/spacy/strings.pyx index 5e0bd90c6d8..b0f6cf5aa6e 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -8,6 +8,7 @@ from typing import Iterator, List, Optional from libc.stdint cimport uint32_t from libc.string cimport memcpy from murmurhash.mrmr cimport hash32, hash64 +from preshed.maps cimport map_clear import srsly @@ -125,10 +126,9 @@ cdef class StringStore: self.mem = Pool() self._non_temp_mem = self.mem self._map = PreshMap() - self._transient_map = None if strings is not None: for string in strings: - self.add(string) + self.add(string, allow_transient=False) def __getitem__(self, object string_or_id): """Retrieve a string from a given hash, or vice versa. 
@@ -158,17 +158,17 @@ cdef class StringStore: return SYMBOLS_BY_INT[str_hash] else: utf8str = self._map.get(str_hash) - if utf8str is NULL and self._transient_map is not None: - utf8str = self._transient_map.get(str_hash) + if utf8str is NULL: + raise KeyError(Errors.E018.format(hash_value=string_or_id)) + else: + return decode_Utf8Str(utf8str) else: # TODO: Raise an error instead utf8str = self._map.get(string_or_id) - if utf8str is NULL and self._transient_map is not None: - utf8str = self._transient_map.get(str_hash) - if utf8str is NULL: - raise KeyError(Errors.E018.format(hash_value=string_or_id)) - else: - return decode_Utf8Str(utf8str) + if utf8str is NULL: + raise KeyError(Errors.E018.format(hash_value=string_or_id)) + else: + return decode_Utf8Str(utf8str) def as_int(self, key): """If key is an int, return it; otherwise, get the int value.""" @@ -184,16 +184,12 @@ cdef class StringStore: else: return self[key] - def __reduce__(self): - strings = list(self.non_transient_keys()) - return (StringStore, (strings,), None, None, None) - def __len__(self) -> int: """The number of strings in the store. RETURNS (int): The number of strings in the store. """ - return self._keys.size() + self._transient_keys.size() + return self.keys.size() + self._transient_keys.size() @contextmanager def memory_zone(self, mem: Optional[Pool] = None) -> Pool: @@ -209,13 +205,13 @@ cdef class StringStore: if mem is None: mem = Pool() self.mem = mem - self._transient_map = PreshMap() yield mem - self.mem = self._non_temp_mem - self._transient_map = None + for key in self._transient_keys: + map_clear(self._map.c_map, key) self._transient_keys.clear() + self.mem = self._non_temp_mem - def add(self, string: str, allow_transient: bool = False) -> int: + def add(self, string: str, allow_transient: Optional[bool] = None) -> int: """Add a string to the StringStore. string (str): The string to add. @@ -226,6 +222,8 @@ cdef class StringStore: internally should not. 
RETURNS (uint64): The string's hash value. """ + if allow_transient is None: + allow_transient = self.mem is not self._non_temp_mem cdef hash_t str_hash if isinstance(string, str): if string in SYMBOLS_BY_STR: @@ -273,8 +271,6 @@ cdef class StringStore: # TODO: Raise an error instead if self._map.get(string_or_id) is not NULL: return True - elif self._transient_map is not None and self._transient_map.get(string_or_id) is not NULL: - return True else: return False if str_hash < len(SYMBOLS_BY_INT): @@ -282,8 +278,6 @@ cdef class StringStore: else: if self._map.get(str_hash) is not NULL: return True - elif self._transient_map is not None and self._transient_map.get(string_or_id) is not NULL: - return True else: return False @@ -292,32 +286,21 @@ cdef class StringStore: YIELDS (str): A string in the store. """ - yield from self.non_transient_keys() - yield from self.transient_keys() - - def non_transient_keys(self) -> Iterator[str]: - """Iterate over the stored strings in insertion order. - - RETURNS: A list of strings. - """ cdef int i cdef hash_t key for i in range(self.keys.size()): key = self.keys[i] utf8str = self._map.get(key) yield decode_Utf8Str(utf8str) + for i in range(self._transient_keys.size()): + key = self._transient_keys[i] + utf8str = self._map.get(key) + yield decode_Utf8Str(utf8str) def __reduce__(self): strings = list(self) return (StringStore, (strings,), None, None, None) - def transient_keys(self) -> Iterator[str]: - if self._transient_map is None: - return [] - for i in range(self._transient_keys.size()): - utf8str = self._transient_map.get(self._transient_keys[i]) - yield decode_Utf8Str(utf8str) - def values(self) -> List[int]: """Iterate over the stored strings hashes in insertion order. 
@@ -327,12 +310,9 @@ cdef class StringStore: hashes = [None] * self._keys.size() for i in range(self._keys.size()): hashes[i] = self._keys[i] - if self._transient_map is not None: - transient_hashes = [None] * self._transient_keys.size() - for i in range(self._transient_keys.size()): - transient_hashes[i] = self._transient_keys[i] - else: - transient_hashes = [] + transient_hashes = [None] * self._transient_keys.size() + for i in range(self._transient_keys.size()): + transient_hashes[i] = self._transient_keys[i] return hashes + transient_hashes def to_disk(self, path): @@ -383,8 +363,10 @@ cdef class StringStore: def _reset_and_load(self, strings): self.mem = Pool() + self._non_temp_mem = self.mem self._map = PreshMap() self.keys.clear() + self._transient_keys.clear() for string in strings: self.add(string, allow_transient=False) @@ -401,19 +383,10 @@ cdef class StringStore: cdef Utf8Str* value = self._map.get(key) if value is not NULL: return value - if allow_transient and self._transient_map is not None: - # If we've already allocated a transient string, and now we - # want to intern it permanently, we'll end up with the string - # in both places. That seems fine -- I don't see why we need - # to remove it from the transient map. 
- value = self._transient_map.get(key) - if value is not NULL: - return value value = _allocate(self.mem, utf8_string, length) - if allow_transient and self._transient_map is not None: - self._transient_map.set(key, value) + self._map.set(key, value) + if allow_transient and self.mem is not self._non_temp_mem: self._transient_keys.push_back(key) else: - self._map.set(key, value) self.keys.push_back(key) return value diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 93b7f63acda..6ca170dd45e 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -517,12 +517,8 @@ cdef class Tokenizer: if n <= 0: # avoid mem alloc of zero length return 0 - # Historically this check was mostly used to avoid caching - # chunks that had tokens owned by the Doc. Now that that's - # not a thing, I don't think we need this? - for i in range(n): - if self.vocab._by_orth.get(tokens[i].lex.orth) == NULL: - return 0 + if self.vocab.in_memory_zone: + return 0 # See #1250 if has_special[0]: return 0 diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 97ba5d68c31..11043c17ae7 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -5,6 +5,7 @@ from typing import Iterator, Optional import numpy import srsly from thinc.api import get_array_module, get_current_ops +from preshed.maps cimport map_clear from .attrs cimport LANG, ORTH from .lexeme cimport EMPTY_LEXEME, OOV_RANK, Lexeme @@ -104,7 +105,7 @@ cdef class Vocab: def vectors(self, vectors): if hasattr(vectors, "strings"): for s in vectors.strings: - self.strings.add(s) + self.strings.add(s, allow_transient=False) self._vectors = vectors self._vectors.strings = self.strings @@ -115,6 +116,10 @@ cdef class Vocab: langfunc = self.lex_attr_getters.get(LANG, None) return langfunc("_") if langfunc else "" + @property + def in_memory_zone(self) -> bool: + return self.mem is not self._non_temp_mem + def __len__(self): """The current number of lexemes stored. @@ -218,7 +223,7 @@ cdef class Vocab: # this size heuristic. 
mem = self.mem lex = mem.alloc(1, sizeof(LexemeC)) - lex.orth = self.strings.add(string) + lex.orth = self.strings.add(string, allow_transient=True) lex.length = len(string) if self.vectors is not None and hasattr(self.vectors, "key2row"): lex.id = self.vectors.key2row.get(lex.orth, OOV_RANK) @@ -239,13 +244,13 @@ cdef class Vocab: cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex, bint is_transient) except -1: self._by_orth.set(lex.orth, lex) self.length += 1 - if is_transient: + if is_transient and self.in_memory_zone: self._transient_orths.push_back(lex.orth) def _clear_transient_orths(self): """Remove transient lexemes from the index (generally at the end of the memory zone)""" for orth in self._transient_orths: - self._by_orth.pop(orth) + map_clear(self._by_orth.c_map, orth) self._transient_orths.clear() def __contains__(self, key): From 4cc3ebe74e0758b7ec36912457365bf342fac590 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 9 Sep 2024 20:56:01 +0200 Subject: [PATCH 22/29] Format --- spacy/language.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/language.py b/spacy/language.py index 57b8514815a..0d9aab9e3b4 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -2093,7 +2093,7 @@ def replace_listeners( tok2vec.remove_listener(listener, pipe_name) @contextmanager - def memory_zone(self, mem: Optional[Pool]=None) -> Iterator[Pool]: + def memory_zone(self, mem: Optional[Pool] = None) -> Iterator[Pool]: """Begin a block where all resources allocated during the block will be freed at the end of it. If a resources was created within the memory zone block, accessing it outside the block is invalid. 
From b18cc94451b49c72bd0fb836a143c2fd340ec5db Mon Sep 17 00:00:00 2001 From: marinelay Date: Tue, 10 Sep 2024 03:57:13 +0900 Subject: [PATCH 23/29] Delete unnecessary method (#13441) Co-authored-by: marinelay --- spacy/lang/mk/__init__.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/spacy/lang/mk/__init__.py b/spacy/lang/mk/__init__.py index 413f0038d0e..9470088a1e6 100644 --- a/spacy/lang/mk/__init__.py +++ b/spacy/lang/mk/__init__.py @@ -24,13 +24,6 @@ class MacedonianDefaults(BaseDefaults): tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS) stop_words = STOP_WORDS - @classmethod - def create_lemmatizer(cls, nlp=None, lookups=None): - if lookups is None: - lookups = Lookups() - return MacedonianLemmatizer(lookups) - - class Macedonian(Language): lang = "mk" Defaults = MacedonianDefaults From 5a7ad5572ca3e9de5c45e9747ccdb3a7a215cb3c Mon Sep 17 00:00:00 2001 From: William Mattingly <62964060+wjbmattingly@users.noreply.github.com> Date: Tue, 10 Sep 2024 08:12:52 -0400 Subject: [PATCH 24/29] added gliner-spacy to universe (#13417) [ci skip] Co-authored-by: Sofie Van Landeghem Co-authored-by: Ines Montani --- website/meta/universe.json | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/website/meta/universe.json b/website/meta/universe.json index 6278dd4899b..e1853f50ed9 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -4517,7 +4517,35 @@ "website": "https://redfield.ai" }, "category": ["standalone"] + }, + { + "id": "gliner-spacy", + "title": "GLiNER spaCy Wrapper", + "slogan": "Integrating GLiNER's Advanced NER with spaCy", + "description": "GLiNER SpaCy Wrapper is a project that brings together GLiNER, a zero-shot Named Entity Recognition (NER) model, with spaCy's NLP capabilities. 
It provides an easy way to integrate GLiNER within the spaCy environment, thus enhancing NER tasks with GLiNER's features.", + "github": "theirstory/gliner-spacy", + "pip": "gliner-spacy", + "code_example": [ + "import spacy", + "", + "nlp = spacy.blank('en')", + "nlp.add_pipe('gliner_spacy')", + "text = 'This is a text about Bill Gates and Microsoft.'", + "doc = nlp(text)", + "", + "for ent in doc.ents:", + " print(ent.text, ent.label_)" + ], + "code_language": "python", + "url": "https://github.com/theirstory/gliner-spacy", + "author": "TheirStory", + "author_links": { + "website": "https://theirstory.io" + }, + "category": ["pipeline"], + "tags": ["NER"] } + ], "categories": [ From 54dc4ee8fbe4343ec0ef7a6fc6dfbc33e34c1769 Mon Sep 17 00:00:00 2001 From: Oren Halvani Date: Tue, 10 Sep 2024 14:13:36 +0200 Subject: [PATCH 25/29] Added: Constituent-Treelib to: universe.json (#13432) [ci skip] Co-authored-by: Halvani <> --- website/meta/universe.json | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/website/meta/universe.json b/website/meta/universe.json index e1853f50ed9..46be506659f 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -16,6 +16,40 @@ }, "category": ["extension"], "tags": [] + }, + { + "id": "constituent_treelib", + "title": "Constituent Treelib", + "slogan": "Extract constituents with ease!", + "description": "Constituent Treelib (CTL) is a lightweight Python library built on top of benepar (Berkeley Neural Parser) as well as the two well-known NLP frameworks spaCy and NLTK. CTL offers you a convenient way to parse sentences into constituent trees, modify them according to their structure, as well as visualize and export them into various file formats. 
In addition, you can extract phrases according to their phrasal categories (which can be used e.g., as features for various NLP tasks), validate already parsed sentences in bracket notation or convert them back into sentences.", + "github": "Halvani/Constituent-Treelib", + "pip": "constituent-treelib", + "code_example": [ + "from constituent_treelib import ConstituentTree, Language", + "# Define the language for the sentence as well as for the spaCy and benepar models", + "language = Language.English", + "# Define which specific SpaCy model should be used (default is Medium)", + "spacy_model_size = ConstituentTree.SpacyModelSize.Medium", + "# Create the pipeline (note, the required models will be downloaded and installed automatically)", + "nlp = ConstituentTree.create_pipeline(language, spacy_model_size)", + "# Your sentence", + "sentence = 'We try to explicitly describe the geometry of the edges of the images.'", + "# Create the tree from where we are going to extract the desired noun phrases", + "tree = ConstituentTree(sentence, nlp)", + "all_phrases = tree.extract_all_phrases(min_words_in_phrases=1)", + "print(all_phrases)", + "# {'PP': ['of the edges of the images', 'of the images'], 'NP': ['We', 'the geometry of the edges of the images', 'the geometry', 'the edges of the images', 'the edges', 'the images'], 'S': ['We try to explicitly describe the geometry of the edges of the images .', 'to explicitly describe the geometry of the edges of the images'], 'VP': ['try to explicitly describe the geometry of the edges of the images', 'to explicitly describe the geometry of the edges of the images', 'describe the geometry of the edges of the images'], 'ADVP': ['explicitly']}" + ], + "code_language": "python", + "url": "https://github.com/Halvani/Constituent-Treelib", + "thumb": "https://github.com/Halvani/Constituent-Treelib/blob/main/assets/images/promo_tree.svg", + "author": "Oren Halvani", + "author_links": { + "github": "Halvani", + "website": 
"https://www.linkedin.com/in/orenhalvani" + }, + "category": ["apis", "standalone", "visualizers"], + "tags": ["apis", "deployment", "constituency ", "parsing"] }, { "id": "sayswho", From 0190e669c5010b5a21eab407162ed4d551469922 Mon Sep 17 00:00:00 2001 From: thjbdvlt <109964512+thjbdvlt@users.noreply.github.com> Date: Tue, 10 Sep 2024 14:17:33 +0200 Subject: [PATCH 26/29] universe-package-quelquhui (#13514) [ci skip] Co-authored-by: Ines Montani --- website/meta/universe.json | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index 46be506659f..ec888727684 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -4552,6 +4552,26 @@ }, "category": ["standalone"] }, + { + "id": "quelquhui", + "title": "quelquhui", + "slogan": "Tokenizer for contemporary French", + "description": "A tokenizer for French that handles inword parentheses like in _(b)rouille_, inclusive language (won't split _relecteur.rice.s_,but will split _mais.maintenant_), hyphens (split _peut-on_, or _pouvons-vous_ but not _tubulu-pimpant_), apostrophes (split _j'arrive_ or _j'arrivons_, but not _aujourd'hui_ or _r'garder_), emoticons, text-emoji (_:happy:_), urls, mails and more.", + "github": "thjbdvlt/quelquhui", + "code_example": [ + "import spacy", + "import quelquhui", + "nlp = spacy.load('fr_core_news_lg')", + "nlp.tokenizer = quelquhui.Toquenizer(nlp.vocab)" + ], + "code_language": "python", + "author": "thjbdvlt", + "author_links": { + "github": "thjbdvlt" + }, + "category": ["pipeline"], + "tags": ["tokenizer", "french"] + }, { "id": "gliner-spacy", "title": "GLiNER spaCy Wrapper", @@ -4579,7 +4599,6 @@ "category": ["pipeline"], "tags": ["NER"] } - ], "categories": [ From 081e4e385d9e2e3271f49796dacf88415cebf29b Mon Sep 17 00:00:00 2001 From: thjbdvlt <109964512+thjbdvlt@users.noreply.github.com> Date: Tue, 10 Sep 2024 14:21:41 +0200 Subject: [PATCH 27/29] 
universe-project-presque (#13515) [ci skip] Co-authored-by: Ines Montani --- website/meta/universe.json | 1390 ++++++++++++++++++++++++++++-------- 1 file changed, 1076 insertions(+), 314 deletions(-) diff --git a/website/meta/universe.json b/website/meta/universe.json index ec888727684..fa71ac2041e 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -14,10 +14,12 @@ "twitter": "@explosion_ai", "github": "explosion" }, - "category": ["extension"], + "category": [ + "extension" + ], "tags": [] }, - { + { "id": "constituent_treelib", "title": "Constituent Treelib", "slogan": "Extract constituents with ease!", @@ -25,20 +27,20 @@ "github": "Halvani/Constituent-Treelib", "pip": "constituent-treelib", "code_example": [ - "from constituent_treelib import ConstituentTree, Language", - "# Define the language for the sentence as well as for the spaCy and benepar models", - "language = Language.English", - "# Define which specific SpaCy model should be used (default is Medium)", - "spacy_model_size = ConstituentTree.SpacyModelSize.Medium", - "# Create the pipeline (note, the required models will be downloaded and installed automatically)", - "nlp = ConstituentTree.create_pipeline(language, spacy_model_size)", - "# Your sentence", - "sentence = 'We try to explicitly describe the geometry of the edges of the images.'", - "# Create the tree from where we are going to extract the desired noun phrases", - "tree = ConstituentTree(sentence, nlp)", - "all_phrases = tree.extract_all_phrases(min_words_in_phrases=1)", - "print(all_phrases)", - "# {'PP': ['of the edges of the images', 'of the images'], 'NP': ['We', 'the geometry of the edges of the images', 'the geometry', 'the edges of the images', 'the edges', 'the images'], 'S': ['We try to explicitly describe the geometry of the edges of the images .', 'to explicitly describe the geometry of the edges of the images'], 'VP': ['try to explicitly describe the geometry of the edges of the images', 'to explicitly 
describe the geometry of the edges of the images', 'describe the geometry of the edges of the images'], 'ADVP': ['explicitly']}" + "from constituent_treelib import ConstituentTree, Language", + "# Define the language for the sentence as well as for the spaCy and benepar models", + "language = Language.English", + "# Define which specific SpaCy model should be used (default is Medium)", + "spacy_model_size = ConstituentTree.SpacyModelSize.Medium", + "# Create the pipeline (note, the required models will be downloaded and installed automatically)", + "nlp = ConstituentTree.create_pipeline(language, spacy_model_size)", + "# Your sentence", + "sentence = 'We try to explicitly describe the geometry of the edges of the images.'", + "# Create the tree from where we are going to extract the desired noun phrases", + "tree = ConstituentTree(sentence, nlp)", + "all_phrases = tree.extract_all_phrases(min_words_in_phrases=1)", + "print(all_phrases)", + "# {'PP': ['of the edges of the images', 'of the images'], 'NP': ['We', 'the geometry of the edges of the images', 'the geometry', 'the edges of the images', 'the edges', 'the images'], 'S': ['We try to explicitly describe the geometry of the edges of the images .', 'to explicitly describe the geometry of the edges of the images'], 'VP': ['try to explicitly describe the geometry of the edges of the images', 'to explicitly describe the geometry of the edges of the images', 'describe the geometry of the edges of the images'], 'ADVP': ['explicitly']}" ], "code_language": "python", "url": "https://github.com/Halvani/Constituent-Treelib", @@ -48,8 +50,17 @@ "github": "Halvani", "website": "https://www.linkedin.com/in/orenhalvani" }, - "category": ["apis", "standalone", "visualizers"], - "tags": ["apis", "deployment", "constituency ", "parsing"] + "category": [ + "apis", + "standalone", + "visualizers" + ], + "tags": [ + "apis", + "deployment", + "constituency ", + "parsing" + ] }, { "id": "sayswho", @@ -69,12 +80,17 @@ "text = 
open(\"path/to/your/text_file.txt\").read()", "sw = SaysWho()", "sw.attribute(text)", - "sw.expand_match() # see quote/cluster matches", "sw.render_to_html() # output your text, quotes and cluster matches to an html file called \"temp.html\"" ], - "category": ["standalone"], - "tags": ["attribution", "coref", "text-processing"] + "category": [ + "standalone" + ], + "tags": [ + "attribution", + "coref", + "text-processing" + ] }, { "id": "parsigs", @@ -96,8 +112,16 @@ "author_links": { "github": "royashcenazi" }, - "category": ["model", "research", "biomedical"], - "tags": ["sigs", "prescription","pharma"] + "category": [ + "model", + "research", + "biomedical" + ], + "tags": [ + "sigs", + "prescription", + "pharma" + ] }, { "id": "latincy", @@ -123,8 +147,13 @@ "github": "diyclassics", "website": "https://diyclassics.github.io/" }, - "category": ["pipeline", "research"], - "tags": ["latin"] + "category": [ + "pipeline", + "research" + ], + "tags": [ + "latin" + ] }, { "id": "odycy", @@ -150,8 +179,14 @@ "github": "centre-for-humanities-computing", "website": "https://chc.au.dk/" }, - "category": ["pipeline", "standalone", "research"], - "tags": ["ancient Greek"] + "category": [ + "pipeline", + "standalone", + "research" + ], + "tags": [ + "ancient Greek" + ] }, { "id": "spacy-wasm", @@ -166,8 +201,13 @@ "twitter": "@SyedAhkam1", "github": "SyedAhkam" }, - "category": ["visualizers"], - "tags": ["visualization", "deployment"] + "category": [ + "visualizers" + ], + "tags": [ + "visualization", + "deployment" + ] }, { "id": "spacysee", @@ -193,8 +233,12 @@ "github": "moxley01", "website": "https://mattoxley.com" }, - "category": ["visualizers"], - "tags": ["visualization"] + "category": [ + "visualizers" + ], + "tags": [ + "visualization" + ] }, { "id": "grecy", @@ -223,8 +267,14 @@ "github": "jmyerston", "website": "https://huggingface.co/spaces/Jacobo/syntax" }, - "category": ["pipeline", "research","models"], - "tags": ["ancient Greek"] + "category": [ + 
"pipeline", + "research", + "models" + ], + "tags": [ + "ancient Greek" + ] }, { "id": "spacy-cleaner", @@ -260,8 +310,12 @@ "github": "Ce11an", "website": "https://www.linkedin.com/in/cellan-hall/" }, - "category": ["extension"], - "tags": ["text-processing"] + "category": [ + "extension" + ], + "tags": [ + "text-processing" + ] }, { "id": "Zshot", @@ -318,7 +372,11 @@ "twitter": "IBMResearch", "website": "https://research.ibm.com/labs/ireland/" }, - "category": ["scientific", "models", "research"] + "category": [ + "scientific", + "models", + "research" + ] }, { "id": "concepcy", @@ -345,9 +403,14 @@ "for token in doc:", " print(f'Word: {token}\n{token._.relatedto}')" ], - "category": ["pipeline"], + "category": [ + "pipeline" + ], "image": "https://github.com/JulesBelveze/concepcy/blob/main/figures/concepcy.png", - "tags": ["semantic", "ConceptNet"], + "tags": [ + "semantic", + "ConceptNet" + ], "author": "Jules Belveze", "author_links": { "github": "JulesBelveze", @@ -375,9 +438,15 @@ "# ('Paris', 'GPE', 'Q90', 'https://www.wikidata.org/wiki/Q90', 0.5652)", "## Set parameter `extra_info` to `True` and check also span._.description, span._.src_description, span._.normal_term, span._.other_ids" ], - "category": ["models", "pipeline"], + "category": [ + "models", + "pipeline" + ], "image": "https://raw.githubusercontent.com/Lucaterre/spacyfishing/main/docs/spacyfishing-logo-resized.png", - "tags": ["NER", "NEL"], + "tags": [ + "NER", + "NEL" + ], "author": "Lucas Terriel", "author_links": { "twitter": "TerreLuca", @@ -391,7 +460,9 @@ "description": "Aim-spaCy helps to easily collect, store and explore training logs for spaCy, including: hyper-parameters, metrics and displaCy visualizations", "github": "aimhubio/aim-spacy", "pip": "aim-spacy", - "code_example": ["https://github.com/aimhubio/aim-spacy/tree/master/examples"], + "code_example": [ + "https://github.com/aimhubio/aim-spacy/tree/master/examples" + ], "code_language": "python", "url": 
"https://aimstack.io/spacy", "thumb": "https://user-images.githubusercontent.com/13848158/172912427-ee9327ea-3cd8-47fa-8427-6c0d36cd831f.png", @@ -402,8 +473,13 @@ "github": "aimhubio", "website": "https://aimstack.io" }, - "category": ["visualizers"], - "tags": ["experiment-tracking", "visualization"] + "category": [ + "visualizers" + ], + "tags": [ + "experiment-tracking", + "visualization" + ] }, { "id": "spacy-report", @@ -417,7 +493,10 @@ "code_example": [ "python -m spacy report textcat training/model-best/ corpus/train.spacy corpus/dev.spacy" ], - "category": ["visualizers", "research"], + "category": [ + "visualizers", + "research" + ], "author": "Vincent D. Warmerdam", "author_links": { "twitter": "fishnets88", @@ -428,7 +507,9 @@ { "id": "scrubadub_spacy", "title": "scrubadub_spacy", - "category": ["pipeline"], + "category": [ + "pipeline" + ], "slogan": "Remove personally identifiable information from text using spaCy.", "description": "scrubadub removes personally identifiable information from text. 
scrubadub_spacy is an extension that uses spaCy NLP models to remove personal information from text.", "github": "LeapBeyond/scrubadub_spacy", @@ -451,8 +532,13 @@ { "id": "spacy-setfit-textcat", "title": "spacy-setfit-textcat", - "category": ["research"], - "tags": ["SetFit", "Few-Shot"], + "category": [ + "research" + ], + "tags": [ + "SetFit", + "Few-Shot" + ], "slogan": "spaCy Project: Experiments with SetFit & Few-Shot Classification", "description": "This project is an experiment with spaCy and few-shot text classification using SetFit", "github": "pmbaumgartner/spacy-setfit-textcat", @@ -471,7 +557,9 @@ { "id": "spacy-experimental", "title": "spacy-experimental", - "category": ["extension"], + "category": [ + "extension" + ], "slogan": "Cutting-edge experimental spaCy components and features", "description": "This package includes experimental components and features for spaCy v3.x, for example model architectures, pipeline components and utilities.", "github": "explosion/spacy-experimental", @@ -492,8 +580,12 @@ { "id": "spacypdfreader", "title": "spacypdfreader", - "category": ["pipeline"], - "tags": ["PDF"], + "category": [ + "pipeline" + ], + "tags": [ + "PDF" + ], "slogan": "Easy PDF to text to spaCy text extraction in Python.", "description": "*spacypdfreader* is a Python library that allows you to convert PDF files directly into *spaCy* `Doc` objects. The library provides several built in parsers or bring your own parser. 
`Doc` objects are annotated with several custom attributes including: `token._.page_number`, `doc._.page_range`, `doc._.first_page`, `doc._.last_page`, `doc._.pdf_file_name`, and `doc._.page(int)`.", "github": "SamEdwardes/spacypdfreader", @@ -550,8 +642,16 @@ "twitter": "cloud_nlp", "website": "https://nlpcloud.io" }, - "category": ["apis", "nonpython", "standalone"], - "tags": ["api", "deploy", "production"] + "category": [ + "apis", + "nonpython", + "standalone" + ], + "tags": [ + "api", + "deploy", + "production" + ] }, { "id": "eMFDscore", @@ -576,8 +676,15 @@ "github": "medianeuroscience", "twitter": "medianeuro" }, - "category": ["research", "teaching"], - "tags": ["morality", "dictionary", "sentiment"] + "category": [ + "research", + "teaching" + ], + "tags": [ + "morality", + "dictionary", + "sentiment" + ] }, { "id": "skweak", @@ -629,7 +736,12 @@ "github": "plison", "website": "https://www.nr.no/~plison" }, - "category": ["pipeline", "standalone", "research", "training"], + "category": [ + "pipeline", + "standalone", + "research", + "training" + ], "tags": [], "spacy_version": 3 }, @@ -653,7 +765,9 @@ "github": "jaidevd", "twitter": "jaidevd" }, - "category": ["standalone"] + "category": [ + "standalone" + ] }, { "id": "spacy-dbpedia-spotlight", @@ -675,7 +789,10 @@ "# inspect the raw data from DBpedia spotlight", "print(doc.ents[0]._.dbpedia_raw_result)" ], - "category": ["models", "pipeline"], + "category": [ + "models", + "pipeline" + ], "author": "Martino Mensio", "author_links": { "twitter": "MartinoMensio", @@ -716,8 +833,13 @@ "github": "SamEdwardes", "website": "https://samedwardes.com" }, - "category": ["pipeline"], - "tags": ["sentiment", "textblob"], + "category": [ + "pipeline" + ], + "tags": [ + "sentiment", + "textblob" + ], "spacy_version": 3 }, { @@ -737,7 +859,10 @@ "# use the similarity method that is based on the vectors, on Doc, Span or Token", "print(doc_1.similarity(doc_2[0:7]))" ], - "category": ["models", "pipeline"], + 
"category": [ + "models", + "pipeline" + ], "author": "Martino Mensio", "author_links": { "twitter": "MartinoMensio", @@ -752,7 +877,9 @@ "github": "explosion/spacy-streamlit", "description": "This package contains utilities for visualizing spaCy models and building interactive spaCy-powered apps with [Streamlit](https://streamlit.io). It includes various building blocks you can use in your own Streamlit app, like visualizers for **syntactic dependencies**, **named entities**, **text classification**, **semantic similarity** via word vectors, token attributes, and more.", "pip": "spacy-streamlit", - "category": ["visualizers"], + "category": [ + "visualizers" + ], "thumb": "https://i.imgur.com/mhEjluE.jpg", "image": "https://user-images.githubusercontent.com/13643239/85388081-f2da8700-b545-11ea-9bd4-e303d3c5763c.png", "code_example": [ @@ -800,8 +927,13 @@ "twitter": "gandersen101", "github": "gandersen101" }, - "category": ["pipeline"], - "tags": ["fuzzy-matching", "regex"] + "category": [ + "pipeline" + ], + "tags": [ + "fuzzy-matching", + "regex" + ] }, { "id": "spacy-universal-sentence-encoder", @@ -820,7 +952,10 @@ "# use the similarity method that is based on the vectors, on Doc, Span or Token", "print(doc_1.similarity(doc_2[0:7]))" ], - "category": ["models", "pipeline"], + "category": [ + "models", + "pipeline" + ], "author": "Martino Mensio", "author_links": { "twitter": "MartinoMensio", @@ -847,7 +982,10 @@ "emb = lang[words]", "emb.plot_interactive(x_axis='man', y_axis='woman')" ], - "category": ["visualizers", "research"], + "category": [ + "visualizers", + "research" + ], "author": "Vincent D. 
Warmerdam", "author_links": { "twitter": "fishnets88", @@ -878,7 +1016,10 @@ "fig = topic_model.visualize_topics()", "fig.show()" ], - "category": ["visualizers", "training"], + "category": [ + "visualizers", + "training" + ], "author": "Maarten Grootendorst", "author_links": { "twitter": "maartengr", @@ -921,7 +1062,10 @@ "# This is where we attach our pre-trained model as a pipeline step.", "attach_sklearn_categoriser(nlp, pipe_name='silly_sentiment', estimator=pipe)" ], - "category": ["pipeline", "training"], + "category": [ + "pipeline", + "training" + ], "author": "Vincent D. Warmerdam", "author_links": { "twitter": "fishnets88", @@ -932,8 +1076,12 @@ { "id": "Klayers", "title": "Klayers", - "category": ["pipeline"], - "tags": ["AWS"], + "category": [ + "pipeline" + ], + "tags": [ + "AWS" + ], "slogan": "spaCy as a AWS Lambda Layer", "description": "A collection of Python Packages as AWS Lambda(λ) Layers", "github": "keithrozario/Klayers", @@ -970,13 +1118,19 @@ "github": "Applied-Language-Technology", "website": "https://applied-language-technology.mooc.fi/" }, - "category": ["videos"] + "category": [ + "videos" + ] }, { "id": "HuSpaCy", "title": "HuSpaCy", - "category": ["models"], - "tags": ["Hungarian"], + "category": [ + "models" + ], + "tags": [ + "Hungarian" + ], "slogan": "HuSpaCy: industrial-strength Hungarian natural language processing", "description": "HuSpaCy is a spaCy model and a library providing industrial-strength Hungarian language processing facilities.", "github": "huspacy/huspacy", @@ -1027,7 +1181,12 @@ " print(token.text, token.lemma_, token.pos_, token.dep_, token.ent_type_)", "print(doc.ents)" ], - "category": ["pipeline", "standalone", "models", "research"], + "category": [ + "pipeline", + "standalone", + "models", + "research" + ], "author": "Explosion", "author_links": { "twitter": "explosion_ai", @@ -1054,7 +1213,12 @@ "for token in doc:", " print(token.text, token.lemma_, token.pos_, token.dep_)" ], - "category": ["pipeline", 
"standalone", "models", "research"], + "category": [ + "pipeline", + "standalone", + "models", + "research" + ], "author": "TakeLab", "author_links": { "github": "TakeLab", @@ -1064,7 +1228,7 @@ { "id": "spacy-server", "title": "spaCy Server", - "slogan": "\uD83E\uDD9C Containerized HTTP API for spaCy NLP", + "slogan": "🦜 Containerized HTTP API for spaCy NLP", "description": "For developers who need programming language agnostic NLP, spaCy Server is a containerized HTTP API that provides industrial-strength natural language processing. Unlike other servers, our server is fast, idiomatic, and well documented.", "github": "neelkamath/spacy-server", "code_example": [ @@ -1078,8 +1242,12 @@ "github": "neelkamath", "website": "https://neelkamath.com" }, - "category": ["apis"], - "tags": ["docker"] + "category": [ + "apis" + ], + "tags": [ + "docker" + ] }, { "id": "nlp-architect", @@ -1088,8 +1256,13 @@ "github": "NervanaSystems/nlp-architect", "pip": "nlp-architect", "thumb": "https://i.imgur.com/vMideRx.png", - "category": ["standalone", "research"], - "tags": ["pytorch"] + "category": [ + "standalone", + "research" + ], + "tags": [ + "pytorch" + ] }, { "id": "Chatterbot", @@ -1116,8 +1289,13 @@ "author_links": { "github": "gunthercox" }, - "category": ["conversational", "standalone"], - "tags": ["chatbots"] + "category": [ + "conversational", + "standalone" + ], + "tags": [ + "chatbots" + ] }, { "id": "alibi", @@ -1133,7 +1311,10 @@ "explainer.explain(x)" ], "author": "Seldon", - "category": ["standalone", "research"] + "category": [ + "standalone", + "research" + ] }, { "id": "spacymoji", @@ -1141,8 +1322,13 @@ "github": "ines/spacymoji", "description": "spaCy extension and pipeline component for adding emoji meta data to `Doc` objects. Detects emoji consisting of one or more unicode characters, and can optionally merge multi-char emoji (combined pictures, emoji with skin tone modifiers) into one token. 
Human-readable emoji descriptions are added as a custom attribute, and an optional lookup table can be provided for your own descriptions. The extension sets the custom `Doc`, `Token` and `Span` attributes `._.is_emoji`, `._.emoji_desc`, `._.has_emoji` and `._.emoji`.", "pip": "spacymoji", - "category": ["pipeline"], - "tags": ["emoji", "unicode"], + "category": [ + "pipeline" + ], + "tags": [ + "emoji", + "unicode" + ], "thumb": "https://i.imgur.com/XOTYIgn.jpg", "code_example": [ "import spacy", @@ -1185,8 +1371,14 @@ "# ('Germany', 'Q183', 'LOC', 'sovereign state in Central Europe', 2.1099332471902863)", "## Check also span._.types, span._.aliases, span._.rank" ], - "category": ["models", "pipeline"], - "tags": ["NER", "NEL"], + "category": [ + "models", + "pipeline" + ], + "tags": [ + "NER", + "NEL" + ], "author": "Renat Shigapov", "author_links": { "twitter": "_shigapov", @@ -1215,7 +1407,9 @@ "author_links": { "github": "mholtzscher" }, - "category": ["pipeline"] + "category": [ + "pipeline" + ] }, { "id": "spacy_cld", @@ -1240,7 +1434,9 @@ "author_links": { "github": "nickdavidhaynes" }, - "category": ["pipeline"] + "category": [ + "pipeline" + ] }, { "id": "spacy-iwnlp", @@ -1263,8 +1459,13 @@ "author_links": { "github": "Liebeck" }, - "category": ["pipeline"], - "tags": ["lemmatizer", "german"] + "category": [ + "pipeline" + ], + "tags": [ + "lemmatizer", + "german" + ] }, { "id": "spacy-sentiws", @@ -1287,8 +1488,13 @@ "author_links": { "github": "Liebeck" }, - "category": ["pipeline"], - "tags": ["sentiment", "german"] + "category": [ + "pipeline" + ], + "tags": [ + "sentiment", + "german" + ] }, { "id": "spacy-lefff", @@ -1313,8 +1519,14 @@ "author_links": { "github": "sammous" }, - "category": ["pipeline"], - "tags": ["pos", "lemmatizer", "french"] + "category": [ + "pipeline" + ], + "tags": [ + "pos", + "lemmatizer", + "french" + ] }, { "id": "lemmy", @@ -1342,8 +1554,13 @@ "author_links": { "github": "sorenlind" }, - "category": ["pipeline"], - 
"tags": ["lemmatizer", "danish"] + "category": [ + "pipeline" + ], + "tags": [ + "lemmatizer", + "danish" + ] }, { "id": "augmenty", @@ -1373,8 +1590,15 @@ "github": "kennethenevoldsen", "website": "https://www.kennethenevoldsen.com" }, - "category": ["training", "research"], - "tags": ["training", "research", "augmentation"] + "category": [ + "training", + "research" + ], + "tags": [ + "training", + "research", + "augmentation" + ] }, { "id": "dacy", @@ -1398,8 +1622,13 @@ "github": "centre-for-humanities-computing", "website": "https://chcaa.io/#/" }, - "category": ["pipeline"], - "tags": ["pipeline", "danish"] + "category": [ + "pipeline" + ], + "tags": [ + "pipeline", + "danish" + ] }, { "id": "spacy-wrap", @@ -1440,8 +1669,16 @@ "github": "KennethEnevoldsen", "website": "https://www.kennethenevoldsen.com" }, - "category": ["pipeline", "models", "training"], - "tags": ["pipeline", "models", "transformers"] + "category": [ + "pipeline", + "models", + "training" + ], + "tags": [ + "pipeline", + "models", + "transformers" + ] }, { "id": "asent", @@ -1480,8 +1717,15 @@ "github": "KennethEnevoldsen", "website": "https://www.kennethenevoldsen.com" }, - "category": ["pipeline", "models"], - "tags": ["pipeline", "models", "sentiment"] + "category": [ + "pipeline", + "models" + ], + "tags": [ + "pipeline", + "models", + "sentiment" + ] }, { "id": "textdescriptives", @@ -1503,8 +1747,15 @@ "author_links": { "github": "HLasse" }, - "category": ["pipeline"], - "tags": ["pipeline", "readability", "syntactic complexity", "descriptive statistics"] + "category": [ + "pipeline" + ], + "tags": [ + "pipeline", + "readability", + "syntactic complexity", + "descriptive statistics" + ] }, { "id": "neuralcoref", @@ -1529,8 +1780,14 @@ "author_links": { "github": "huggingface" }, - "category": ["standalone", "conversational", "models"], - "tags": ["coref"] + "category": [ + "standalone", + "conversational", + "models" + ], + "tags": [ + "coref" + ] }, { "id": "neuralcoref-vizualizer", 
@@ -1541,8 +1798,14 @@ "image": "https://i.imgur.com/3yy4Qyf.png", "thumb": "https://i.imgur.com/j6FO9O6.jpg", "github": "huggingface/neuralcoref", - "category": ["visualizers", "conversational"], - "tags": ["coref", "chatbots"], + "category": [ + "visualizers", + "conversational" + ], + "tags": [ + "coref", + "chatbots" + ], "author": "Hugging Face", "author_links": { "github": "huggingface" @@ -1562,7 +1825,9 @@ "github": "ines", "website": "https://ines.io" }, - "category": ["visualizers"] + "category": [ + "visualizers" + ] }, { "id": "displacy", @@ -1578,7 +1843,9 @@ "github": "ines", "website": "https://ines.io" }, - "category": ["visualizers"] + "category": [ + "visualizers" + ] }, { "id": "displacy-ent", @@ -1594,7 +1861,9 @@ "github": "ines", "website": "https://ines.io" }, - "category": ["visualizers"] + "category": [ + "visualizers" + ] }, { "id": "explacy", @@ -1613,7 +1882,9 @@ "author_links": { "github": "tylerneylon" }, - "category": ["visualizers"] + "category": [ + "visualizers" + ] }, { "id": "deplacy", @@ -1633,7 +1904,9 @@ "author_links": { "github": "KoichiYasuoka" }, - "category": ["visualizers"] + "category": [ + "visualizers" + ] }, { "id": "scattertext", @@ -1669,7 +1942,9 @@ "github": "JasonKessler", "twitter": "jasonkessler" }, - "category": ["visualizers"] + "category": [ + "visualizers" + ] }, { "id": "rasa", @@ -1684,8 +1959,12 @@ "author_links": { "github": "RasaHQ" }, - "category": ["conversational"], - "tags": ["chatbots"] + "category": [ + "conversational" + ], + "tags": [ + "chatbots" + ] }, { "id": "mindmeld", @@ -1695,8 +1974,13 @@ "github": "cisco/mindmeld", "pip": "mindmeld", "thumb": "https://www.mindmeld.com/img/mindmeld-logo.png", - "category": ["conversational", "ner"], - "tags": ["chatbots"], + "category": [ + "conversational", + "ner" + ], + "tags": [ + "chatbots" + ], "author": "Cisco", "author_links": { "github": "cisco/mindmeld", @@ -1721,8 +2005,13 @@ "... 
fields={'sentence_tokenized': ('text', data.Field(sequential=True)),", "... 'sentiment_gold': ('labels', data.Field(sequential=False))})" ], - "category": ["standalone", "research"], - "tags": ["pytorch"] + "category": [ + "standalone", + "research" + ], + "tags": [ + "pytorch" + ] }, { "id": "allennlp", @@ -1739,7 +2028,10 @@ "twitter": "allenai_org", "website": "http://allenai.org" }, - "category": ["standalone", "research"] + "category": [ + "standalone", + "research" + ] }, { "id": "scispacy", @@ -1755,7 +2047,12 @@ "twitter": "allenai_org", "website": "http://allenai.org" }, - "category": ["scientific", "models", "research", "biomedical"] + "category": [ + "scientific", + "models", + "research", + "biomedical" + ] }, { "id": "textacy", @@ -1769,7 +2066,9 @@ "github": "bdewilde", "twitter": "bjdewilde" }, - "category": ["standalone"] + "category": [ + "standalone" + ] }, { "id": "textpipe", @@ -1782,8 +2081,13 @@ "github": "textpipe", "website": "https://github.com/textpipe/textpipe/blob/master/CONTRIBUTORS.md" }, - "category": ["standalone"], - "tags": ["text-processing", "named-entity-recognition"], + "category": [ + "standalone" + ], + "tags": [ + "text-processing", + "named-entity-recognition" + ], "thumb": "https://avatars0.githubusercontent.com/u/40492530", "code_example": [ "from textpipe import doc, pipeline", @@ -1818,7 +2122,10 @@ "github": "ahalterman", "twitter": "ahalterman" }, - "category": ["standalone", "scientific"] + "category": [ + "standalone", + "scientific" + ] }, { "id": "kindred", @@ -1843,7 +2150,10 @@ "author_links": { "github": "jakelever" }, - "category": ["standalone", "scientific"] + "category": [ + "standalone", + "scientific" + ] }, { "id": "sense2vec", @@ -1870,8 +2180,14 @@ "# (('computer vision', 'NOUN'), 0.8636297),", "# (('deep learning', 'NOUN'), 0.8573361)]" ], - "category": ["pipeline", "standalone", "visualizers"], - "tags": ["vectors"], + "category": [ + "pipeline", + "standalone", + "visualizers" + ], + "tags": [ + 
"vectors" + ], "author": "Explosion", "author_links": { "twitter": "explosion_ai", @@ -1896,7 +2212,9 @@ ], "code_language": "r", "author": "Kenneth Benoit & Aki Matsuo", - "category": ["nonpython"] + "category": [ + "nonpython" + ] }, { "id": "cleannlp", @@ -1909,7 +2227,9 @@ "author_links": { "github": "statsmaths" }, - "category": ["nonpython"] + "category": [ + "nonpython" + ] }, { "id": "spacy-cpp", @@ -1928,7 +2248,9 @@ "author_links": { "github": "d99kris" }, - "category": ["nonpython"] + "category": [ + "nonpython" + ] }, { "id": "ruby-spacy", @@ -1956,8 +2278,12 @@ "github": "yohasebe", "twitter": "yohasebe" }, - "category": ["nonpython"], - "tags": ["ruby"] + "category": [ + "nonpython" + ], + "tags": [ + "ruby" + ] }, { "id": "spacy_api", @@ -1974,7 +2300,9 @@ "author_links": { "github": "kootenpv" }, - "category": ["apis"] + "category": [ + "apis" + ] }, { "id": "spacy-api-docker", @@ -1997,7 +2325,9 @@ "author_links": { "github": "jgontrum" }, - "category": ["apis"] + "category": [ + "apis" + ] }, { "id": "spacy-nlp", @@ -2016,7 +2346,10 @@ "author_links": { "github": "kengz" }, - "category": ["apis", "nonpython"] + "category": [ + "apis", + "nonpython" + ] }, { "id": "prodigy", @@ -2034,7 +2367,10 @@ "✨ Starting the web server on port 8080..." 
], "code_language": "bash", - "category": ["standalone", "training"], + "category": [ + "standalone", + "training" + ], "author": "Explosion", "author_links": { "twitter": "explosion_ai", @@ -2054,7 +2390,9 @@ "github": "DragonComputer", "website": "http://dragon.computer" }, - "category": ["standalone"] + "category": [ + "standalone" + ] }, { "id": "prefect", @@ -2079,7 +2417,9 @@ "author_links": { "website": "https://prefect.io" }, - "category": ["standalone"] + "category": [ + "standalone" + ] }, { "id": "graphbrain", @@ -2090,7 +2430,9 @@ "pip": "graphbrain", "thumb": "https://i.imgur.com/cct9W1E.png", "author": "Graphbrain", - "category": ["standalone"] + "category": [ + "standalone" + ] }, { "type": "education", @@ -2101,7 +2443,9 @@ "cover": "https://i.imgur.com/w0iycjl.jpg", "url": "https://nostarch.com/NLPPython", "author": "Yuli Vasiliev", - "category": ["books"] + "category": [ + "books" + ] }, { "type": "education", @@ -2112,7 +2456,9 @@ "cover": "https://covers.oreillystatic.com/images/0636920030515/lrg.jpg", "url": "http://shop.oreilly.com/product/0636920030515.do", "author": "Andreas Müller, Sarah Guido", - "category": ["books"] + "category": [ + "books" + ] }, { "type": "education", @@ -2124,7 +2470,9 @@ "cover": "https://i.imgur.com/AOmzZu8.png", "url": "https://www.amazon.com/Text-Analytics-Python-Real-World-Actionable/dp/148422387X", "author": "Dipanjan Sarkar", - "category": ["books"] + "category": [ + "books" + ] }, { "type": "education", @@ -2136,7 +2484,9 @@ "cover": "https://i.imgur.com/5F4mkt7.jpg", "url": "https://www.amazon.com/Practical-Machine-Learning-Python-Problem-Solvers/dp/1484232062", "author": "Dipanjan Sarkar, Raghav Bali, Tushar Sharma", - "category": ["books"] + "category": [ + "books" + ] }, { "type": "education", @@ -2147,7 +2497,9 @@ "cover": "https://i.imgur.com/aleMf1Y.jpg", "url": "https://www.amazon.com/Natural-Language-Processing-Computational-Linguistics-ebook/dp/B07BWH779J", "author": "Bhargav Srinivasa-Desikan", - 
"category": ["books"] + "category": [ + "books" + ] }, { "type": "education", @@ -2163,7 +2515,9 @@ "github": "DuyguA", "website": "https://www.linkedin.com/in/duygu-altinok-4021389a" }, - "category": ["books"] + "category": [ + "books" + ] }, { "type": "education", @@ -2179,7 +2533,9 @@ "github": "aapatel09", "website": "https://www.ankurapatel.io" }, - "category": ["books"] + "category": [ + "books" + ] }, { "type": "education", @@ -2189,7 +2545,9 @@ "url": "http://spacy.pythonhumanities.com/", "thumb": "https://spacy.pythonhumanities.com/_static/freecodecamp_small.jpg", "author": "Dr. W.J.B. Mattingly", - "category": ["courses"] + "category": [ + "courses" + ] }, { "type": "education", @@ -2206,7 +2564,9 @@ "github": "ines", "website": "https://ines.io" }, - "category": ["courses"] + "category": [ + "courses" + ] }, { "type": "education", @@ -2223,7 +2583,9 @@ "github": "thiippal", "website": "https://www.mv.helsinki.fi/home/thiippal/" }, - "category": ["courses"] + "category": [ + "courses" + ] }, { "type": "education", @@ -2238,7 +2600,9 @@ "github": "honnibal", "website": "https://explosion.ai" }, - "category": ["videos"] + "category": [ + "videos" + ] }, { "type": "education", @@ -2253,7 +2617,9 @@ "website": "https://explosion.ai" }, "youtube": "jpWqz85F_4Y", - "category": ["videos"] + "category": [ + "videos" + ] }, { "type": "education", @@ -2263,7 +2629,9 @@ "description": "Academic and industry research in Natural Language Processing (NLP) has progressed at an accelerating pace over the last several years. Members of the Python community have been hard at work moving cutting-edge research out of papers and into open source, \"batteries included\" software libraries that can be applied to practical problems. 
We'll explore some of these tools for modern NLP in Python.", "author": "Patrick Harrison", "youtube": "6zm9NC9uRkk", - "category": ["videos"] + "category": [ + "videos" + ] }, { "type": "education", @@ -2277,7 +2645,9 @@ "github": "ines" }, "youtube": "THduWAnG97k", - "category": ["videos"] + "category": [ + "videos" + ] }, { "type": "education", @@ -2291,7 +2661,9 @@ "github": "ines" }, "youtube": "K1elwpgDdls", - "category": ["videos"] + "category": [ + "videos" + ] }, { "type": "education", @@ -2304,7 +2676,9 @@ "twitter": "Mariacamilagl30" }, "youtube": "RNiLVCE5d4k", - "category": ["videos"] + "category": [ + "videos" + ] }, { "type": "education", @@ -2318,7 +2692,9 @@ "github": "koaning" }, "youtube": "WnGPv6HnBok", - "category": ["videos"] + "category": [ + "videos" + ] }, { "type": "education", @@ -2332,7 +2708,9 @@ "github": "koaning" }, "youtube": "KL4-Mpgbahw", - "category": ["videos"] + "category": [ + "videos" + ] }, { "type": "education", @@ -2346,7 +2724,9 @@ "github": "koaning" }, "youtube": "4V0JDdohxAk", - "category": ["videos"] + "category": [ + "videos" + ] }, { "type": "education", @@ -2360,7 +2740,9 @@ "github": "koaning" }, "youtube": "IqOJU1-_Fi0", - "category": ["videos"] + "category": [ + "videos" + ] }, { "type": "education", @@ -2374,7 +2756,9 @@ "github": "koaning" }, "youtube": "f4sqeLRzkPg", - "category": ["videos"] + "category": [ + "videos" + ] }, { "type": "education", @@ -2388,7 +2772,9 @@ "github": "koaning" }, "youtube": "k77RrmMaKEI", - "category": ["videos"] + "category": [ + "videos" + ] }, { "type": "education", @@ -2402,7 +2788,9 @@ "github": "svlandeg" }, "youtube": "PW3RJM8tDGo", - "category": ["videos"] + "category": [ + "videos" + ] }, { "type": "education", @@ -2416,7 +2804,9 @@ "github": "guadi1994" }, "youtube": "88zcQODyuko", - "category": ["videos"] + "category": [ + "videos" + ] }, { "type": "education", @@ -2430,7 +2820,9 @@ "github": "DeNeutoy" }, "youtube": "2_HSKDALwuw", - "category": ["videos"] + "category": 
[ + "videos" + ] }, { "type": "education", @@ -2445,7 +2837,9 @@ "author_links": { "website": "https://soundcloud.com/nlp-highlights" }, - "category": ["podcasts"] + "category": [ + "podcasts" + ] }, { "type": "education", @@ -2461,7 +2855,9 @@ "author_links": { "website": "https://www.podcastinit.com" }, - "category": ["podcasts"] + "category": [ + "podcasts" + ] }, { "type": "education", @@ -2477,7 +2873,9 @@ "author_links": { "website": "https://www.podcastinit.com" }, - "category": ["podcasts"] + "category": [ + "podcasts" + ] }, { "type": "education", @@ -2492,7 +2890,9 @@ "author_links": { "website": "https://talkpython.fm/" }, - "category": ["podcasts"] + "category": [ + "podcasts" + ] }, { "type": "education", @@ -2508,7 +2908,9 @@ "author_links": { "website": "https://twimlai.com" }, - "category": ["podcasts"] + "category": [ + "podcasts" + ] }, { "type": "education", @@ -2524,7 +2926,9 @@ "website": "https://www.analyticsvidhya.com", "twitter": "analyticsvidhya" }, - "category": ["podcasts"] + "category": [ + "podcasts" + ] }, { "type": "education", @@ -2539,7 +2943,9 @@ "website": "https://changelog.com/practicalai", "twitter": "PracticalAIFM" }, - "category": ["podcasts"] + "category": [ + "podcasts" + ] }, { "type": "education", @@ -2551,7 +2957,9 @@ "github": "svlandeg" }, "youtube": "8u57WSXVpmw", - "category": ["videos"] + "category": [ + "videos" + ] }, { "id": "self-attentive-parser", @@ -2579,7 +2987,10 @@ "github": "nikitakit", "website": "http://kitaev.io" }, - "category": ["research", "pipeline"] + "category": [ + "research", + "pipeline" + ] }, { "id": "spacy-graphql", @@ -2588,8 +2999,12 @@ "github": "ines/spacy-graphql", "description": "A very simple and experimental app that lets you query spaCy's linguistic annotations using [GraphQL](https://graphql.org/). The API currently supports most token attributes, named entities, sentences and text categories (if available as `doc.cats`, i.e. if you added a text classifier to a model). 
The `meta` field will return the model meta data. Models are only loaded once and kept in memory.", "url": "https://explosion.ai/demos/spacy-graphql", - "category": ["apis"], - "tags": ["graphql"], + "category": [ + "apis" + ], + "tags": [ + "graphql" + ], "thumb": "https://i.imgur.com/xC7zpTO.png", "code_example": [ "{", @@ -2647,8 +3062,12 @@ "github": "ines", "website": "https://ines.io" }, - "category": ["nonpython"], - "tags": ["javascript"] + "category": [ + "nonpython" + ], + "tags": [ + "javascript" + ] }, { "id": "spacy-wordnet", @@ -2656,7 +3075,10 @@ "slogan": "WordNet meets spaCy", "description": "`spacy-wordnet` creates annotations that easily allow the use of WordNet and [WordNet Domains](http://wndomains.fbk.eu/) by using the [NLTK WordNet interface](http://www.nltk.org/howto/wordnet.html)", "github": "recognai/spacy-wordnet", - "tags": ["wordnet", "synsets"], + "tags": [ + "wordnet", + "synsets" + ], "thumb": "https://i.imgur.com/ud4C7cj.png", "code_example": [ "import spacy", @@ -2684,7 +3106,9 @@ "twitter": "recogn_ai", "website": "https://recogn.ai" }, - "category": ["pipeline"] + "category": [ + "pipeline" + ] }, { "id": "spacy-conll", @@ -2717,8 +3141,16 @@ "website": "http://bramvanroy.be" }, "github": "BramVanroy/spacy_conll", - "category": ["standalone", "pipeline"], - "tags": ["linguistics", "computational linguistics", "conll", "conll-u"] + "category": [ + "standalone", + "pipeline" + ], + "tags": [ + "linguistics", + "computational linguistics", + "conll", + "conll-u" + ] }, { "id": "ludwig", @@ -2735,7 +3167,10 @@ "twitter": "w4nderlus7", "website": "http://w4nderlu.st" }, - "category": ["standalone", "research"] + "category": [ + "standalone", + "research" + ] }, { "id": "pic2phrase_bot", @@ -2749,7 +3184,10 @@ "author_links": { "twitter": "VasilievYuli" }, - "category": ["standalone", "conversational"] + "category": [ + "standalone", + "conversational" + ] }, { "id": "pyInflect", @@ -2770,8 +3208,12 @@ "author_links": { "github": 
"bjascob" }, - "category": ["pipeline"], - "tags": ["inflection"] + "category": [ + "pipeline" + ], + "tags": [ + "inflection" + ] }, { "id": "lemminflect", @@ -2793,8 +3235,13 @@ "author_links": { "github": "bjascob" }, - "category": ["pipeline"], - "tags": ["inflection", "lemmatizer"] + "category": [ + "pipeline" + ], + "tags": [ + "inflection", + "lemmatizer" + ] }, { "id": "amrlib", @@ -2816,7 +3263,9 @@ "author_links": { "github": "bjascob" }, - "category": ["pipeline"] + "category": [ + "pipeline" + ] }, { "id": "classyclassification", @@ -2857,7 +3306,10 @@ "github": "davidberenstein1957", "website": "https://www.linkedin.com/in/david-berenstein-1bab11105/" }, - "category": ["pipeline", "standalone"], + "category": [ + "pipeline", + "standalone" + ], "tags": [ "classification", "zero-shot", @@ -2909,8 +3361,14 @@ "github": "davidberenstein1957", "website": "https://www.linkedin.com/in/david-berenstein-1bab11105/" }, - "category": ["pipeline"], - "tags": ["ner", "few-shot", "gensim"], + "category": [ + "pipeline" + ], + "tags": [ + "ner", + "few-shot", + "gensim" + ], "spacy_version": 3 }, { @@ -2960,8 +3418,16 @@ "github": "davidberenstein1957", "website": "https://www.linkedin.com/in/david-berenstein-1bab11105/" }, - "category": ["pipeline", "standalone"], - "tags": ["coreference", "multi-lingual", "cross-lingual", "allennlp"], + "category": [ + "pipeline", + "standalone" + ], + "tags": [ + "coreference", + "multi-lingual", + "cross-lingual", + "allennlp" + ], "spacy_version": 3 }, { @@ -3012,8 +3478,16 @@ "github": "davidberenstein1957", "website": "https://www.linkedin.com/in/david-berenstein-1bab11105/" }, - "category": ["standalone"], - "tags": ["ner", "few-shot", "augmentation", "datasets", "training"], + "category": [ + "standalone" + ], + "tags": [ + "ner", + "few-shot", + "augmentation", + "datasets", + "training" + ], "spacy_version": 3 }, { @@ -3025,43 +3499,49 @@ "pip": "spacy-setfit", "thumb": 
"https://raw.githubusercontent.com/davidberenstein1957/spacy-setfit/main/logo.png", "code_example": [ - "import spacy", - "", - "# Create some example data", - "train_dataset = {", - " \"inlier\": [", - " \"Text about furniture\",", - " \"Couches, benches and televisions.\",", - " \"I really need to get a new sofa.\"", - " ],", - " \"outlier\": [", - " \"Text about kitchen equipment\",", - " \"This text is about politics\",", - " \"Comments about AI and stuff.\"", - " ]", - "}", - "", - "# Load the spaCy language model:", - "nlp = spacy.load(\"en_core_web_sm\")", - "", - "# Add the \"spacy_setfit\" pipeline component to the spaCy model, and configure it with SetFit parameters:", - "nlp.add_pipe(\"spacy_setfit\", config={", - " \"pretrained_model_name_or_path\": \"paraphrase-MiniLM-L3-v2\",", - " \"setfit_trainer_args\": {", - " \"train_dataset\": train_dataset", - " }", - "})", - "doc = nlp(\"I really need to get a new sofa.\")", - "doc.cats", - "# {'inlier': 0.902350975129, 'outlier': 0.097649024871}" + "import spacy", + "", + "# Create some example data", + "train_dataset = {", + " \"inlier\": [", + " \"Text about furniture\",", + " \"Couches, benches and televisions.\",", + " \"I really need to get a new sofa.\"", + " ],", + " \"outlier\": [", + " \"Text about kitchen equipment\",", + " \"This text is about politics\",", + " \"Comments about AI and stuff.\"", + " ]", + "}", + "", + "# Load the spaCy language model:", + "nlp = spacy.load(\"en_core_web_sm\")", + "", + "# Add the \"spacy_setfit\" pipeline component to the spaCy model, and configure it with SetFit parameters:", + "nlp.add_pipe(\"spacy_setfit\", config={", + " \"pretrained_model_name_or_path\": \"paraphrase-MiniLM-L3-v2\",", + " \"setfit_trainer_args\": {", + " \"train_dataset\": train_dataset", + " }", + "})", + "doc = nlp(\"I really need to get a new sofa.\")", + "doc.cats", + "# {'inlier': 0.902350975129, 'outlier': 0.097649024871}" ], "author": "David Berenstein", "author_links": { "github": 
"davidberenstein1957", "website": "https://www.linkedin.com/in/david-berenstein-1bab11105/" }, - "category": ["pipeline"], - "tags": ["few-shot", "SetFit", "training"], + "category": [ + "pipeline" + ], + "tags": [ + "few-shot", + "SetFit", + "training" + ], "spacy_version": 3 }, { @@ -3079,7 +3559,11 @@ "twitter": "ICLRanD", "website": "https://research.iclr.co.uk" }, - "category": ["scientific", "models", "research"] + "category": [ + "scientific", + "models", + "research" + ] }, { "id": "NGym", @@ -3091,8 +3575,12 @@ "image": "https://github.com/d5555/NeuralGym/raw/master/NGym.png", "thumb": "https://github.com/d5555/NeuralGym/raw/master/NGym/web.png", "author": "d5555", - "category": ["training"], - "tags": ["windows"] + "category": [ + "training" + ], + "tags": [ + "windows" + ] }, { "id": "holmes", @@ -3102,8 +3590,14 @@ "url": "https://github.com/explosion/holmes-extractor", "description": "Holmes is a Python 3 library that supports a number of use cases involving information extraction from English and German texts, including chatbot, structural extraction, topic matching and supervised document classification. There is a [website demonstrating intelligent search based on topic matching](https://holmes-demo.explosion.services).", "pip": "holmes-extractor", - "category": ["pipeline", "standalone"], - "tags": ["chatbots", "text-processing"], + "category": [ + "pipeline", + "standalone" + ], + "tags": [ + "chatbots", + "text-processing" + ], "thumb": "https://raw.githubusercontent.com/explosion/holmes-extractor/master/docs/holmes_thumbnail.png", "code_example": [ "import holmes_extractor as holmes", @@ -3124,8 +3618,15 @@ "url": "https://github.com/explosion/coreferee", "description": "Coreferee is a pipeline plugin that performs coreference resolution for English, French, German and Polish. It is designed so that it is easy to add support for new languages and optimised for limited training data. It uses a mixture of neural networks and programmed rules. 
Please note you will need to [install models](https://github.com/explosion/coreferee#getting-started) before running the code example.", "pip": "coreferee", - "category": ["pipeline", "models", "standalone"], - "tags": ["coreference-resolution", "anaphora"], + "category": [ + "pipeline", + "models", + "standalone" + ], + "tags": [ + "coreference-resolution", + "anaphora" + ], "code_example": [ "import coreferee, spacy", "nlp = spacy.load('en_core_web_trf')", @@ -3157,7 +3658,11 @@ "github": "explosion/spacy-transformers", "url": "https://explosion.ai/blog/spacy-transformers", "pip": "spacy-transformers", - "category": ["pipeline", "models", "research"], + "category": [ + "pipeline", + "models", + "research" + ], "code_example": [ "import spacy", "", @@ -3180,7 +3685,10 @@ "thumb": "https://i.imgur.com/j6FO9O6.jpg", "url": "https://github.com/explosion/spacy-huggingface-hub", "pip": "spacy-huggingface-hub", - "category": ["pipeline", "models"], + "category": [ + "pipeline", + "models" + ], "author": "Explosion", "author_links": { "twitter": "explosion_ai", @@ -3195,7 +3703,11 @@ "github": "mmxgn/spacy-clausie", "url": "https://github.com/mmxgn/spacy-clausie", "description": "ClausIE, a novel, clause-based approach to open information extraction, which extracts relations and their arguments from natural language text", - "category": ["pipeline", "scientific", "research"], + "category": [ + "pipeline", + "scientific", + "research" + ], "code_example": [ "import spacy", "import claucy", @@ -3238,7 +3750,9 @@ "author_links": { "github": "kuk" }, - "category": ["visualizers"] + "category": [ + "visualizers" + ] }, { "id": "negspacy", @@ -3248,8 +3762,14 @@ "url": "https://github.com/jenojp/negspacy", "description": "negspacy is a spaCy pipeline component that evaluates whether Named Entities are negated in text. 
It adds an extension to 'Span' objects.", "pip": "negspacy", - "category": ["pipeline", "scientific"], - "tags": ["negation", "text-processing"], + "category": [ + "pipeline", + "scientific" + ], + "tags": [ + "negation", + "text-processing" + ], "thumb": "https://github.com/jenojp/negspacy/blob/master/docs/thumb.png?raw=true", "image": "https://github.com/jenojp/negspacy/blob/master/docs/icon.png?raw=true", "code_example": [ @@ -3276,8 +3796,14 @@ "github": "dumitrescustefan/ronec", "url": "https://github.com/dumitrescustefan/ronec", "description": "The corpus holds 5127 sentences, annotated with 16 classes, with a total of 26376 annotated entities. The corpus comes into two formats: BRAT and CONLLUP.", - "category": ["standalone", "models"], - "tags": ["ner", "romanian"], + "category": [ + "standalone", + "models" + ], + "tags": [ + "ner", + "romanian" + ], "thumb": "https://raw.githubusercontent.com/dumitrescustefan/ronec/master/res/thumb.png", "code_example": [ "# to train a new model on ronec", @@ -3305,7 +3831,10 @@ "description": "This spaCy project trains an NER model and a custom Text Classification model with Clause Segmentation and Blinding capabilities to analyze supplement reviews and their potential effects on health.", "github": "explosion/healthsea", "thumb": "https://github.com/explosion/healthsea/blob/main/img/Jellyfish.png", - "category": ["pipeline", "research"], + "category": [ + "pipeline", + "research" + ], "code_example": [ "import spacy", "", @@ -3354,7 +3883,9 @@ "url": "https://aka.ms/presidio", "image": "https://raw.githubusercontent.com/microsoft/presidio/master/docs/assets/before-after.png", "github": "microsoft/presidio", - "category": ["standalone"], + "category": [ + "standalone" + ], "thumb": "https://avatars0.githubusercontent.com/u/6154722", "author": "Microsoft", "author_links": { @@ -3368,7 +3899,9 @@ "description": "This package features data-science related tasks for developing new recognizers for Microsoft Presidio. 
It is used for the evaluation of the entire system, as well as for evaluating specific PII recognizers or PII detection models. Anyone interested in evaluating an existing Microsoft Presidio instance, a specific PII recognizer or to develop new models or logic for detecting PII could leverage the preexisting work in this package. Additionally, anyone interested in generating new data based on previous datasets (e.g. to increase the coverage of entity values) for Named Entity Recognition models could leverage the data generator contained in this package.", "url": "https://aka.ms/presidio-research", "github": "microsoft/presidio-research", - "category": ["standalone"], + "category": [ + "standalone" + ], "thumb": "https://avatars0.githubusercontent.com/u/6154722", "author": "Microsoft", "author_links": { @@ -3382,8 +3915,12 @@ "github": "nipunsadvilkar/pySBD", "description": "pySBD is 'real-world' sentence segmenter which extracts reasonable sentences when the format and domain of the input text are unknown. It is a rules-based algorithm based on [The Golden Rules](https://s3.amazonaws.com/tm-town-nlp-resources/golden_rules.txt) - a set of tests to check accuracy of segmenter in regards to edge case scenarios developed by [TM-Town](https://www.tm-town.com/) dev team. 
pySBD is python port of ruby gem [Pragmatic Segmenter](https://github.com/diasks2/pragmatic_segmenter).", "pip": "pysbd", - "category": ["scientific"], - "tags": ["sentence segmentation"], + "category": [ + "scientific" + ], + "tags": [ + "sentence segmentation" + ], "code_example": [ "from pysbd.utils import PySBDFactory", "", @@ -3410,7 +3947,9 @@ "url": "https://github.com/microsoft/cookiecutter-spacy-fastapi", "image": "https://raw.githubusercontent.com/microsoft/cookiecutter-spacy-fastapi/master/images/cookiecutter-docs.png", "github": "microsoft/cookiecutter-spacy-fastapi", - "category": ["apis"], + "category": [ + "apis" + ], "thumb": "https://avatars0.githubusercontent.com/u/6154722", "author": "Microsoft", "author_links": { @@ -3424,8 +3963,13 @@ "github": "yash1994/dframcy", "description": "DframCy is a light-weight utility module to integrate Pandas Dataframe to spaCy's linguistic annotation and training tasks.", "pip": "dframcy", - "category": ["pipeline", "training"], - "tags": ["pandas"], + "category": [ + "pipeline", + "training" + ], + "tags": [ + "pandas" + ], "code_example": [ "import spacy", "from dframcy import DframCy", @@ -3482,8 +4026,16 @@ "github": "ceteri", "website": "https://derwen.ai/paco" }, - "category": ["pipeline"], - "tags": ["phrase extraction", "ner", "summarization", "graph algorithms", "textrank"] + "category": [ + "pipeline" + ], + "tags": [ + "phrase extraction", + "ner", + "summarization", + "graph algorithms", + "textrank" + ] }, { "id": "spacy_syllables", @@ -3509,8 +4061,13 @@ "author_links": { "github": "sloev" }, - "category": ["pipeline"], - "tags": ["syllables", "multilingual"] + "category": [ + "pipeline" + ], + "tags": [ + "syllables", + "multilingual" + ] }, { "id": "sentimental-onix", @@ -3554,8 +4111,13 @@ "author_links": { "github": "sloev" }, - "category": ["pipeline"], - "tags": ["sentiment", "english"] + "category": [ + "pipeline" + ], + "tags": [ + "sentiment", + "english" + ] }, { "id": "gobbli", @@ -3593,7 
+4155,9 @@ "", "predict_output = clf.predict(predict_input)" ], - "category": ["standalone"] + "category": [ + "standalone" + ] }, { "id": "spacy_fastlang", @@ -3616,7 +4180,9 @@ "author_links": { "github": "thomasthiebaud" }, - "category": ["pipeline"] + "category": [ + "pipeline" + ] }, { "id": "mlflow", @@ -3634,7 +4200,10 @@ "twitter": "databricks", "website": "https://databricks.com/" }, - "category": ["standalone", "apis"], + "category": [ + "standalone", + "apis" + ], "code_example": [ "import mlflow", "import mlflow.spacy", @@ -3687,8 +4256,13 @@ "github": "kevinlu1248", "website": "https://github.com/kevinlu1248/pyate" }, - "category": ["pipeline", "research"], - "tags": ["term_extraction"] + "category": [ + "pipeline", + "research" + ], + "tags": [ + "term_extraction" + ] }, { "id": "contextualSpellCheck", @@ -3717,8 +4291,18 @@ "github": "r1j1t", "website": "https://github.com/R1j1t" }, - "category": ["pipeline", "conversational", "research"], - "tags": ["spell check", "correction", "preprocessing", "translation", "correction"] + "category": [ + "pipeline", + "conversational", + "research" + ], + "tags": [ + "spell check", + "correction", + "preprocessing", + "translation", + "correction" + ] }, { "id": "texthero", @@ -3744,7 +4328,9 @@ "github": "jbesomi", "website": "https://besomi.ai" }, - "category": ["standalone"] + "category": [ + "standalone" + ] }, { "id": "cov-bsv", @@ -3763,8 +4349,18 @@ "print(doc._.cov_classification)", "cov_bsv.visualize_doc(doc)" ], - "category": ["pipeline", "standalone", "biomedical", "scientific"], - "tags": ["clinical", "epidemiology", "covid-19", "surveillance"], + "category": [ + "pipeline", + "standalone", + "biomedical", + "scientific" + ], + "tags": [ + "clinical", + "epidemiology", + "covid-19", + "surveillance" + ], "author": "Alec Chapman", "author_links": { "github": "abchapman93" @@ -3792,8 +4388,14 @@ " print(ent, ent._.is_negated, ent._.is_family, ent._.is_historical)", 
"medspacy.visualization.visualize_ent(doc)" ], - "category": ["biomedical", "scientific", "research"], - "tags": ["clinical"], + "category": [ + "biomedical", + "scientific", + "research" + ], + "tags": [ + "clinical" + ], "author": "medspacy", "author_links": { "github": "medspacy" @@ -3828,8 +4430,15 @@ "r = nlp(\"She was wearing a short wide-cut dress\")", "print(list([{\"label\": e.label_, \"text\": e.text} for e in r.ents]))" ], - "category": ["standalone"], - "tags": ["dsl", "language-patterns", "language-rules", "nlp"], + "category": [ + "standalone" + ], + "tags": [ + "dsl", + "language-patterns", + "language-rules", + "nlp" + ], "author": "Šarūnas Navickas", "author_links": { "github": "zaibacu" @@ -3858,8 +4467,15 @@ "author_links": { "github": "revuel" }, - "category": ["scientific", "research", "standalone"], - "tags": ["Evolutionary Computation", "Grammatical Evolution"] + "category": [ + "scientific", + "research", + "standalone" + ], + "tags": [ + "Evolutionary Computation", + "Grammatical Evolution" + ] }, { "id": "SpacyDotNet", @@ -3913,7 +4529,9 @@ "author_links": { "github": "AMArostegui" }, - "category": ["nonpython"] + "category": [ + "nonpython" + ] }, { "id": "ruts", @@ -3939,8 +4557,14 @@ "twitter": "shk_sergey", "github": "SergeyShk" }, - "category": ["pipeline", "standalone"], - "tags": ["Text Analytics", "Russian"] + "category": [ + "pipeline", + "standalone" + ], + "tags": [ + "Text Analytics", + "Russian" + ] }, { "id": "trunajod", @@ -3974,8 +4598,16 @@ "author_links": { "github": "dpalmasan" }, - "category": ["research", "standalone", "scientific"], - "tags": ["Text Analytics", "Coherence", "Cohesion"] + "category": [ + "research", + "standalone", + "scientific" + ], + "tags": [ + "Text Analytics", + "Coherence", + "Cohesion" + ] }, { "id": "lingfeat", @@ -4033,7 +4665,10 @@ "github": "brucewlee", "website": "https://brucewlee.github.io/" }, - "category": ["research", "scientific"], + "category": [ + "research", + "scientific" + ], 
"tags": [ "Readability", "Simplification", @@ -4118,8 +4753,17 @@ "twitter": "bodak", "website": "https://github.com/babylonhealth/" }, - "category": ["pipeline", "standalone", "scientific", "biomedical"], - "tags": ["babylonhealth", "rule-engine", "matcher"] + "category": [ + "pipeline", + "standalone", + "scientific", + "biomedical" + ], + "tags": [ + "babylonhealth", + "rule-engine", + "matcher" + ] }, { "id": "forte", @@ -4150,8 +4794,13 @@ "github": "asyml", "website": "https://petuum.com" }, - "category": ["pipeline", "standalone"], - "tags": ["pipeline"] + "category": [ + "pipeline", + "standalone" + ], + "tags": [ + "pipeline" + ] }, { "id": "spacy-api-docker-v3", @@ -4174,7 +4823,9 @@ "author_links": { "github": "bbieniek" }, - "category": ["apis"] + "category": [ + "apis" + ] }, { "id": "phruzz_matcher", @@ -4217,8 +4868,17 @@ "twitter": "vallotin", "website": "https://fiqus.coop/" }, - "category": ["pipeline", "research", "standalone"], - "tags": ["spacy", "python", "nlp", "ner"] + "category": [ + "pipeline", + "research", + "standalone" + ], + "tags": [ + "spacy", + "python", + "nlp", + "ner" + ] }, { "id": "WordDumb", @@ -4233,7 +4893,9 @@ "author_links": { "github": "xxyzz" }, - "category": ["standalone"] + "category": [ + "standalone" + ] }, { "id": "eng_spacysentiment", @@ -4258,8 +4920,14 @@ "github": "Vishnunkumar", "twitter": "vishnun_uchiha" }, - "category": ["pipeline"], - "tags": ["pipeline", "nlp", "sentiment"] + "category": [ + "pipeline" + ], + "tags": [ + "pipeline", + "nlp", + "sentiment" + ] }, { "id": "textnets", @@ -4282,7 +4950,10 @@ "github": "jboynyc", "twitter": "jboy" }, - "category": ["visualizers", "standalone"] + "category": [ + "visualizers", + "standalone" + ] }, { "id": "tmtoolkit", @@ -4318,7 +4989,10 @@ "github": "internaut", "twitter": "_knrd" }, - "category": ["scientific", "standalone"] + "category": [ + "scientific", + "standalone" + ] }, { "id": "edsnlp", @@ -4359,8 +5033,15 @@ "github": "aphp", "website": 
"https://github.com/aphp" }, - "category": ["biomedical", "scientific", "research", "pipeline"], - "tags": ["clinical"] + "category": [ + "biomedical", + "scientific", + "research", + "pipeline" + ], + "tags": [ + "clinical" + ] }, { "id": "sent-pattern", @@ -4374,8 +5055,13 @@ "twitter": "ExZ79575296", "github": "lll-lll-lll-lll" }, - "category": ["pipeline"], - "tags": ["interpretation", "ja"] + "category": [ + "pipeline" + ], + "tags": [ + "interpretation", + "ja" + ] }, { "id": "spacy-partial-tagger", @@ -4384,7 +5070,10 @@ "description": "This is a library to build a CRF tagger with a partially annotated dataset in spaCy. You can build your own tagger only from dictionary.", "github": "doccano/spacy-partial-tagger", "pip": "spacy-partial-tagger", - "category": ["pipeline", "training"], + "category": [ + "pipeline", + "training" + ], "author": "Yasufumi Taniguchi", "author_links": { "github": "yasufumy" @@ -4414,8 +5103,13 @@ "github": "wannaphong", "website": "https://iam.wannaphong.com/" }, - "category": ["pipeline", "research"], - "tags": ["Thai"] + "category": [ + "pipeline", + "research" + ], + "tags": [ + "Thai" + ] }, { "id": "vetiver", @@ -4445,8 +5139,14 @@ "github": "rstudio", "website": "https://posit.co/" }, - "category": ["apis", "standalone"], - "tags": ["apis", "deployment"] + "category": [ + "apis", + "standalone" + ], + "tags": [ + "apis", + "deployment" + ] }, { "id": "span_marker", @@ -4476,8 +5176,14 @@ "github": "tomaarsen", "website": "https://www.linkedin.com/in/tomaarsen" }, - "category": ["pipeline", "standalone", "scientific"], - "tags": ["ner"] + "category": [ + "pipeline", + "standalone", + "scientific" + ], + "tags": [ + "ner" + ] }, { "id": "hobbit-spacy", @@ -4501,8 +5207,15 @@ "github": "wjbmattingly", "website": "https://wjbmattingly.com" }, - "category": ["pipeline", "standalone"], - "tags": ["spans", "rules", "ner"] + "category": [ + "pipeline", + "standalone" + ], + "tags": [ + "spans", + "rules", + "ner" + ] }, { "id": 
"rolegal", @@ -4512,7 +5225,12 @@ "description": "This is a spaCy language model for Romanian legal domain trained with floret 4-gram to 5-gram embeddings and `LEGAL` entity recognition. Useful for processing OCR-resulted noisy legal documents.", "github": "senisioi/rolegal", "pip": "ro-legal-fl", - "tags": ["legal", "floret", "ner", "romanian"], + "tags": [ + "legal", + "floret", + "ner", + "romanian" + ], "code_example": [ "import spacy", "nlp = spacy.load(\"ro_legal_fl\")", @@ -4533,7 +5251,11 @@ "github": "senisioi", "website": "https://nlp.unibuc.ro/people/snisioi.html" }, - "category": ["pipeline", "training", "models"] + "category": [ + "pipeline", + "training", + "models" + ] }, { "id": "redfield-spacy-nodes", @@ -4550,7 +5272,9 @@ "github": "Redfield-AB", "website": "https://redfield.ai" }, - "category": ["standalone"] + "category": [ + "standalone" + ] }, { "id": "quelquhui", @@ -4569,8 +5293,13 @@ "author_links": { "github": "thjbdvlt" }, - "category": ["pipeline"], - "tags": ["tokenizer", "french"] + "category": [ + "pipeline" + ], + "tags": [ + "tokenizer", + "french" + ] }, { "id": "gliner-spacy", @@ -4596,11 +5325,44 @@ "author_links": { "website": "https://theirstory.io" }, - "category": ["pipeline"], - "tags": ["NER"] + "category": [ + "pipeline" + ], + "tags": [ + "NER" + ] + }, + { + "id": "presque", + "title": "presque", + "slogan": "Normalizer for contemporary French", + "description": "Normalizer for French with focus on online and informal communication, _peùUUUt-èTRE_ becomes _peut-être_, _voilaaaa_ becomes _voilà_. 
It also harmonizes inclusive language (the user can choose how): by default, _auteur-rice-s-x et relecteur.xrices_ becomes _auteur·ricexs et relecteur·ricexs_.", + "github": "thjbdvlt/presque", + "code_example": [ + "import spacy", + "import presque", + "", + "@spacy.Language.factory('presque_normalizer')", + "def create_presque_normalizer(nlp, name='presque_normalizer'):", + "    return presque.Normalizer(nlp=nlp)", + "", + "nlp = spacy.load('fr_core_news_lg')", + "nlp.add_pipe('presque_normalizer', first=True)" + ], + "code_language": "python", + "author": "thjbdvlt", + "author_links": { + "github": "thjbdvlt" + }, + "category": [ + "pipeline" + ], + "tags": [ + "normalizer", + "french" + ] } ], - "categories": [ { "label": "Projects", From 89c1774d43712bf26e1df821638ac9e168bf0e26 Mon Sep 17 00:00:00 2001 From: William Mattingly <62964060+wjbmattingly@users.noreply.github.com> Date: Tue, 10 Sep 2024 08:24:06 -0400 Subject: [PATCH 28/29] added bagpipes-spacy to universe (#13425) [ci skip] Co-authored-by: Ines Montani --- website/meta/universe.json | 43 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/website/meta/universe.json b/website/meta/universe.json index fa71ac2041e..adef0fead74 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -5361,6 +5361,49 @@ "normalizer", "french" ] + }, + { + "id": "bagpipes-spacy", + "title": "Bagpipes spaCy", + "slogan": "A bag of custom spaCy pipes for various NLP tasks.", + "description": "Bagpipes spaCy is a versatile collection of custom spaCy pipeline components enhancing text processing capabilities. It includes functionalities such as phrase extraction, text normalization, triple detection, entity and sentence clustering, token clustering, and keyword extraction.
These components augment NLP tasks with advanced processing and analysis features, offering a comprehensive toolkit for natural language data handling.", + "github": "wjbmattingly/bagpipes-spacy", + "pip": "bagpipes-spacy", + "code_example": [ + "import spacy", + "from bagpipes_spacy import PhrasesExtractor", + "nlp = spacy.load(\"en_core_web_md\")", + "nlp.add_pipe(\"phrases_extractor\")", + "text = 'Seconds later, he had climbed out onto a rather fine antique rug, brushing ash from the sleeves of his long pin-striped cloak, a lime-green bowler hat in his hand.'", + "doc = nlp(text)", + "print('Prepositional Phrases')", + "print(doc._.prep_phrases)", + "print('Noun Phrases')", + "print(doc._.noun_phrases)", + "print('Verb Phrases')", + "print(doc._.verb_phrases)", + "print('Adj Phrases')", + "print(doc._.adj_phrases)" + ], + "code_language": "python", + "url": "https://github.com/wjbmattingly/bagpipes-spacy", + "thumb": "https://github.com/wjbmattingly/bagpipes-spacy/raw/main/images/bagpipes-spacy-icon.png?raw=true", + "image": "https://github.com/wjbmattingly/bagpipes-spacy/raw/main/images/bagpipes-spacy-logo.png?raw=true", + "author": "W.J.B. 
Mattingly", + "author_links": { + "twitter": "wjb_mattingly", + "github": "wjbmattingly", + "website": "https://www.wjbmattingly.com" + }, + "category": [ + "pipeline" + ], + "tags": [ + "spacy", + "text processing", + "NLP", + "custom components" + ] } ], "categories": [ From 7fbbb2002ac9e8e3b4ce05d9bc5dcef8b4aa80f0 Mon Sep 17 00:00:00 2001 From: William Mattingly <62964060+wjbmattingly@users.noreply.github.com> Date: Tue, 10 Sep 2024 08:25:23 -0400 Subject: [PATCH 29/29] updated universe for number spacy (#13424) [ci skip] Co-authored-by: Ines Montani --- website/meta/universe.json | 40 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/website/meta/universe.json b/website/meta/universe.json index adef0fead74..9a0e94bb7d8 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -5404,6 +5404,46 @@ "NLP", "custom components" ] + }, + { + "id": "number-spacy", + "title": "Number spaCy", + "slogan": "Enhancing Numeric Entity Recognition in Text with spaCy", + "description": "Number spaCy is a custom spaCy pipeline component that enhances the identification of number entities in text and fetches the parsed numeric values using spaCy's token extensions. It uses RegEx to identify number entities written in words and then leverages the [word2number](https://github.com/akshaynagpal/w2n) library to convert those words into structured numeric data. The output numeric value is stored in a custom entity extension: `._.number`. This lightweight component can be seamlessly added to an existing spaCy pipeline or integrated into a blank model. If using within an existing spaCy pipeline, ensure to insert it before the NER model.", + "github": "wjbmattingly/number-spacy", + "pip": "number-spacy", + "code_example": [ + "import spacy", + "from number_spacy import find_numbers", + "", + "nlp = spacy.blank('en')", + "nlp.add_pipe('find_numbers')", + "", + "doc = nlp('I have three apples. 
She gave me twenty-two more, and now I have twenty-five apples in total.')", + "", + "for ent in doc.ents:", + " if ent.label_ == 'NUMBER':", + " print(f'Text: {ent.text} -> Parsed Number: {ent._.number}')" + ], + "code_language": "python", + "url": "https://github.com/wjbmattingly/number-spacy", + "thumb": "https://github.com/wjbmattingly/number-spacy/raw/main/images/number-spacy-logo.png?raw=true", + "image": "https://github.com/wjbmattingly/number-spacy/raw/main/images/number-spacy-logo.png?raw=true", + "author": "W.J.B. Mattingly", + "author_links": { + "twitter": "wjb_mattingly", + "github": "wjbmattingly", + "website": "https://www.wjbmattingly.com" + }, + "category": [ + "pipeline" + ], + "tags": [ + "spacy", + "number", + "NLP", + "entity recognition" + ] } ], "categories": [