From 1c06c83d95f70c4d3d30bdd17b309444fab4a8eb Mon Sep 17 00:00:00 2001 From: LE Van Tuan Date: Wed, 16 Sep 2020 01:04:23 +0200 Subject: [PATCH] init --- .gitignore | 2 + README.md | 98 +++- people_also_ask/__init__.py | 8 + people_also_ask/exceptions.py | 67 +++ people_also_ask/google.py | 183 +++++++ people_also_ask/parser.py | 448 ++++++++++++++++++ .../tests/fixtures/cheetah_vs_lion.html | 206 ++++++++ .../tests/fixtures/gangnam_style.html | 206 ++++++++ .../how_to_make_a_cold_brew_coffee.html | 208 ++++++++ ...0_highest-grossing_movies_of_all_time.html | 204 ++++++++ ...hat_are_3_basic_programming_languages.html | 204 ++++++++ .../tests/fixtures/what_time_is_it.html | 203 ++++++++ .../fixtures/why_was_ho_chi_minh_a_hero.html | 206 ++++++++ .../world_university_rankings_2019.html | 207 ++++++++ people_also_ask/tests/test_google.py | 31 ++ people_also_ask/tests/test_parser.py | 52 ++ people_also_ask/tests/where_is_france | 205 ++++++++ people_also_ask/tests/who_is_ho_chi_minh? | 208 ++++++++ people_also_ask/tools.py | 83 ++++ setup.py | 20 + 20 files changed, 3047 insertions(+), 2 deletions(-) create mode 100644 .gitignore mode change 100644 => 100755 README.md create mode 100755 people_also_ask/__init__.py create mode 100644 people_also_ask/exceptions.py create mode 100755 people_also_ask/google.py create mode 100644 people_also_ask/parser.py create mode 100644 people_also_ask/tests/fixtures/cheetah_vs_lion.html create mode 100644 people_also_ask/tests/fixtures/gangnam_style.html create mode 100644 people_also_ask/tests/fixtures/how_to_make_a_cold_brew_coffee.html create mode 100644 people_also_ask/tests/fixtures/the_10_highest-grossing_movies_of_all_time.html create mode 100644 people_also_ask/tests/fixtures/what_are_3_basic_programming_languages.html create mode 100644 people_also_ask/tests/fixtures/what_time_is_it.html create mode 100644 people_also_ask/tests/fixtures/why_was_ho_chi_minh_a_hero.html create mode 100644 
people_also_ask/tests/fixtures/world_university_rankings_2019.html create mode 100644 people_also_ask/tests/test_google.py create mode 100644 people_also_ask/tests/test_parser.py create mode 100644 people_also_ask/tests/where_is_france create mode 100644 people_also_ask/tests/who_is_ho_chi_minh? create mode 100755 people_also_ask/tools.py create mode 100755 setup.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..c49f188 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +*_pycache_* +*.swp diff --git a/README.md b/README.md old mode 100644 new mode 100755 index ad68290..c3bea9a --- a/README.md +++ b/README.md @@ -1,2 +1,96 @@ -# people_also_ask -Python wrapper for google people-alos-ask +# People-also-ask + +[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/python/black) +[![PyPI](https://img.shields.io/pypi/v/people_also_ask.svg)](https://pypi.org/project/people-also-ask) +[![downloads](https://img.shields.io/pypi/dm/people_also_ask.svg)](https://pypistats.org/packages/people-also-ask) +[![versions](https://img.shields.io/pypi/pyversions/people_also_ask.svg)](https://github.com/lagranges/people_also_ask) + +People-also-ask provides APIs to easily crawl the data of Google featured snippets. + +## ⚠ Warning +Search engines like Google do not allow any sort of automated access to their service, but from a legal point of view there is no known case or broken law. Google does not take legal action against scraping, likely for self-protective reasons. +The API has been configured not to abuse the Google search engine. + +## Installation +``` +pip install people_also_ask +``` + +## Usage +The goal of ``people_also_ask`` is to provide a simple and easy-to-use API for retrieving information from Google Featured Snippets. 
+ +### Importing +```python +import people_also_ask +``` + +### How to get related questions +```python +people_also_ask.get_related_questions("coffee") + +['Is coffee good for your health?', + 'Why is coffee bad for you?', + 'Who invented coffee?', + 'What do u know about coffee?'] +``` + +### How to get more questions +```python +people_also_ask.get_related_questions("coffee", 5) + +['How did coffee originate?', + 'Is coffee good for your health?', + 'Who brought coffee America?', + 'Who invented coffee?', + 'Why is coffee bad for you?', + 'Why is drinking coffee bad for you?'] +``` + +### Generate unlimited questions +```python +for question in people_also_ask.generate_related_questions("coffee"): print(question) + +Why is coffee bad for you? +Who invented coffee? +Is coffee good for your health? +Who brought coffee America? +How did coffee originate? +Why is drinking coffee bad for you? +.... +``` + +### Get answer for a question +```python +people_also_ask.get_answer("Why is coffee bad for you?") + +{'has_answer': True, + 'question': 'Why is coffee bad for you?', + 'related_questions': ['Why is drinking coffee bad for you?', + 'Is coffee good for your health?', + 'Is coffee toxic to your body?', + 'What does coffee do to your body?'], + 'response': 'Consuming too much caffeine can lead to jitteriness, anxiety, heart palpitations and even exacerbated panic attacks (34). If you are sensitive to caffeine and tend to become overstimulated, you may want to avoid coffee altogether. Another unwanted side effect is that it can disrupt sleep ( 35 ).Aug 30, 2018', + 'heading': 'Consuming too much caffeine can lead to jitteriness, anxiety, heart palpitations and even exacerbated panic attacks (34). If you are sensitive to caffeine and tend to become overstimulated, you may want to avoid coffee altogether. Another unwanted side effect is that it can disrupt sleep ( 35 ).Aug 30, 2018', + 'title': 'Coffee — Good or Bad? 
- Healthline', + 'link': 'https://www.healthline.com/nutrition/coffee-good-or-bad#:~:text=Consuming%20too%20much%20caffeine%20can,can%20disrupt%20sleep%20(%2035%20).', + 'displayed_link': 'www.healthline.com › nutrition › coffee-good-or-bad', + 'snippet_str': 'Consuming too much caffeine can lead to jitteriness, anxiety, heart palpitations and even exacerbated panic attacks (34). If you are sensitive to caffeine and tend to become overstimulated, you may want to avoid coffee altogether. Another unwanted side effect is that it can disrupt sleep ( 35 ).Aug 30, 2018\nwww.healthline.com › nutrition › coffee-good-or-bad\nhttps://www.healthline.com/nutrition/coffee-good-or-bad#:~:text=Consuming%20too%20much%20caffeine%20can,can%20disrupt%20sleep%20(%2035%20).\nCoffee — Good or Bad? - Healthline', + 'snippet_data': None, + 'date': None, + 'snippet_type': 'Definition Featured Snippet', + 'snippet_str_body': '', + 'raw_text': 'Featured snippet from the web\nConsuming too much caffeine can lead to jitteriness, anxiety, heart palpitations and even exacerbated panic attacks (34). If \nyou\n are sensitive to caffeine and tend to become overstimulated, \n may want to avoid \ncoffee\n altogether. Another unwanted side effect is that it can disrupt sleep ( 35 ).\nAug 30, 2018\nCoffee — Good or Bad? - Healthline\nwww.healthline.com\n › nutrition › coffee-good-or-bad'} +``` + +### Get Simple Answer for a question +```python +people_also_ask.get_simple_answer("Why is coffee bad for you?") + +'Consuming too much caffeine can lead to jitteriness, anxiety, heart palpitations and even exacerbated panic attacks (34). If you are sensitive to caffeine and tend to become overstimulated, you may want to avoid coffee altogether. 
Another unwanted side effect is that it can disrupt sleep ( 35 ).Aug 30, 2018'
+```
+
+
+### Generate questions and answer around a subject
+```python
+people_also_ask.generate_answer("coffee")
+```
diff --git a/people_also_ask/__init__.py b/people_also_ask/__init__.py
new file mode 100755
index 0000000..a7e4c94
--- /dev/null
+++ b/people_also_ask/__init__.py
@@ -0,0 +1,8 @@
+#! /usr/bin/env python3
+from people_also_ask.google import (
+    get_answer,
+    generate_answer,
+    get_simple_answer,
+    get_related_questions,
+    generate_related_questions,
+)
diff --git a/people_also_ask/exceptions.py b/people_also_ask/exceptions.py
new file mode 100644
index 0000000..bd0c979
--- /dev/null
+++ b/people_also_ask/exceptions.py
@@ -0,0 +1,77 @@
+#! /usr/bin/env python3
+"""
+Global related-questions exception and warning classes.
+"""
+
+
+GITHUB_LINK = "Github"
+
+
+class RelatedQuestionError(Exception):
+    """Base Related-Questions exception class."""
+
+    def __init__(self, error):
+        self.error = error
+
+    # Python 3 builds str(exception) from __str__; __unicode__ is never
+    # called implicitly, so the message has to be defined here.
+    def __str__(self):
+        return (
+            f'An unkown error occured: {self.error}.'
+            f' Please report it on {GITHUB_LINK}.'
+        )
+
+    def __unicode__(self):
+        """Backward-compatible alias of ``__str__``."""
+        return str(self)
+
+
+class FeaturedSnippetParserError(RelatedQuestionError):
+    """
+    Exception raised when failed to get answer from
+    search result page
+
+    :param str text: searched text, stored as ``keyword``
+    """
+
+    def __init__(self, text):
+        self.keyword = text
+
+    def __str__(self):
+        return (
+            f"Cannot parse result page of '{self.keyword}'."
+            f" It mays due to a format change of result page."
+            f' Please report it on {GITHUB_LINK}.'
+        )
+
+
+class RelatedQuestionParserError(RelatedQuestionError):
+    """
+    Exception raised when failed to get related questions
+    from search result page
+
+    :param str text: searched text, stored as ``keyword``
+    """
+
+    def __init__(self, text):
+        self.keyword = text
+
+    def __str__(self):
+        return (
+            f"Cannot parse result page of '{self.keyword}'."
+            f" It mays due to a format change of result page."
+            f' Please report it on {GITHUB_LINK}.'
+        )
+
+
+class GoogleSearchRequestFailedError(RelatedQuestionError):
+    """Exception raised when failed to request search on google"""
+
+    def __init__(self, url, keyword):
+        self.url = url
+        self.keyword = keyword
+
+    def __str__(self):
+        return (
+            f"Failed to requests {self.url}/{self.keyword}"
+        )
diff --git a/people_also_ask/google.py b/people_also_ask/google.py
new file mode 100755
index 0000000..f9e7158
--- /dev/null
+++ b/people_also_ask/google.py
@@ -0,0 +1,200 @@
+#! /usr/bin/env python3
+import os
+import sys
+import time
+import logging
+import requests
+from bs4 import BeautifulSoup
+from typing import List, Dict, Any, Optional, Generator
+
+from people_also_ask.tools import retryable
+from people_also_ask.parser import (
+    extract_related_questions,
+    get_featured_snippet_parser,
+)
+from people_also_ask.exceptions import (
+    GoogleSearchRequestFailedError,
+    RelatedQuestionParserError,
+    FeaturedSnippetParserError
+)
+from people_also_ask.tools import CallingSemaphore
+
+
+URL = "https://www.google.com/search"
+HEADERS = {
+    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
+    " AppleWebKit/537.36 (KHTML, like Gecko) "
+    "Chrome/84.0.4147.135 Safari/537.36"
+}
+SESSION = requests.Session()
+NB_TIMES_RETRY = 3
+# Environment variables are always strings: convert them so that the
+# limits are numbers, like their default values.
+NB_REQUESTS_LIMIT = int(os.environ.get(
+    "RELATED_QUESTION_NBREQUESTS_LIMIT", 25
+))
+NB_REQUESTS_DURATION_LIMIT = float(os.environ.get(
+    "RELATED_QUESTION_DURATION_LIMIT", 60  # seconds
+))
+logging.basicConfig()
+semaphore = CallingSemaphore(
+    NB_REQUESTS_LIMIT, NB_REQUESTS_DURATION_LIMIT
+)
+
+
+@retryable(NB_TIMES_RETRY)
+def search(keyword: str) -> Optional[BeautifulSoup]:
+    """
+    return html parser of google search result
+
+    :param str keyword: text to search
+    :raises GoogleSearchRequestFailedError: if the request fails or the
+        response status code is not 200
+    """
+    params = {"q": keyword}
+    try:
+        with semaphore:
+            time.sleep(0.5)  # be nice with google :)
+            response = SESSION.get(URL, params=params, headers=HEADERS)
+    except Exception as error:
+        raise GoogleSearchRequestFailedError(URL, keyword) from error
+    if response.status_code != 200:
+        raise GoogleSearchRequestFailedError(URL, keyword)
+    return BeautifulSoup(response.text, "html.parser")
+
+
+def _get_related_questions(text: str) -> List[str]:
+    """
+    return a list of questions related to text.
+    These questions are from search result of text
+
+    :param str text: text to search
+    """
+    document = search(text)
+    if not document:
+        return []
+    try:
+        return extract_related_questions(document)
+    except Exception as error:
+        raise RelatedQuestionParserError(text) from error
+
+
+def generate_related_questions(text: str) -> Generator[str, None, None]:
+    """
+    generate the questions related to text,
+    these questions are found recursively
+
+    :param str text: text to search
+    """
+    questions = set(_get_related_questions(text))
+    # {text}, not set(text): the latter is the set of the characters of
+    # text, so the searched text itself would never be filtered out
+    searched_text = {text}
+    while questions:
+        text = questions.pop()
+        yield text
+        searched_text.add(text)
+        questions |= set(_get_related_questions(text))
+        questions -= searched_text
+
+
+def get_related_questions(text: str, max_nb_questions: Optional[int] = None):
+    """
+    return a number of questions related to text.
+    These questions are found recursively.
+
+    :param str text: text to search
+    :param int max_nb_questions: maximum number of questions to return;
+        if None, only the questions of the first result page are returned
+    """
+    if max_nb_questions is None:
+        return _get_related_questions(text)
+    if max_nb_questions <= 0:
+        return []
+    questions = []
+    for question in generate_related_questions(text):
+        questions.append(question)
+        # stop before asking the generator for one more question,
+        # which would send another search request
+        if len(questions) >= max_nb_questions:
+            break
+    return questions
+
+
+def get_answer(question: str) -> Dict[str, Any]:
+    """
+    return a dictionary as answer for a question.
+
+    :param str question: asked question
+    :raises FeaturedSnippetParserError: if the featured snippet found
+        in the result page cannot be converted to a dictionary
+    """
+    document = search(question)
+    related_questions = extract_related_questions(document)
+    featured_snippet = get_featured_snippet_parser(
+        question, document)
+    if not featured_snippet:
+        res = dict(
+            has_answer=False,
+            question=question,
+            related_questions=related_questions,
+        )
+    else:
+        res = dict(
+            has_answer=True,
+            question=question,
+            related_questions=related_questions,
+        )
+        try:
+            res.update(featured_snippet.to_dict())
+        except Exception as error:
+            raise FeaturedSnippetParserError(question) from error
+    return res
+
+
+def generate_answer(text: str) -> Generator[dict, None, None]:
+    """
+    generate answers of questions related to text
+
+    :param str text: text to search
+    """
+    answer = get_answer(text)
+    questions = set(answer["related_questions"])
+    searched_text = {text}
+    if answer["has_answer"]:
+        yield answer
+    while questions:
+        text = questions.pop()
+        answer = get_answer(text)
+        if answer["has_answer"]:
+            yield answer
+        searched_text.add(text)
+        # reuse the answer fetched above instead of searching twice
+        questions |= set(answer["related_questions"])
+        questions -= searched_text
+
+
+def get_simple_answer(question: str, depth: bool = False) -> str:
+    """
+    return a text as summary answer for the question
+
+    :param str question: asked question
+    :param bool depth: return the answer of first related question
+        if no answer found for question
+    """
+    document = search(question)
+    featured_snippet = get_featured_snippet_parser(
+        question, document)
+    if featured_snippet:
+        return featured_snippet.response
+    if depth:
+        related_questions = get_related_questions(question)
+        if not related_questions:
+            return ""
+        return get_simple_answer(related_questions[0])
+    return ""
+
+
+if __name__ == "__main__":
+    from pprint import pprint as print
+    print(get_answer(sys.argv[1]))
diff --git a/people_also_ask/parser.py b/people_also_ask/parser.py
new file mode 100644
index 0000000..df117d4
--- /dev/null
+++ b/people_also_ask/parser.py
@@ -0,0 +1,490 @@
+#! /usr/bin/env python3
+from bs4.element import Tag
+from bs4 import BeautifulSoup
+from operator import attrgetter
+from typing import List, Optional
+from people_also_ask.tools import itemize, tabulate, remove_redundant
+
+
+FEATURED_SNIPPET_ATTRIBUTES = [
+    "response", "heading", "title", "link", "displayed_link",
+    "snippet_str", "snippet_data", "date",
+    "snippet_type", "snippet_str_body", "raw_text"
+]
+
+
+def extract_related_questions(document: BeautifulSoup) -> List[str]:
+    """Return the text of every "related-question-pair" div of document."""
+    div_questions = document.find_all("div", class_="related-question-pair")
+    get_text = attrgetter("text")
+    if not div_questions:
+        return []
+    questions = list(map(get_text, div_questions))
+    return questions
+
+
+def is_ol_but_not_a_menu(tag):
+    """Return True if tag is an <ol> whose role attribute is not "menu"."""
+    return (
+        tag.name == "ol"
+        and (
+            not tag.has_attr("role")
+            or (tag.has_attr("role") and tag["role"] != "menu")
+        )
+    )
+
+
+def get_tag_heading(tag):
+    """Return the div of role "heading" of tag, level 3 one preferred."""
+    return (
+        tag.find("div", {"role": "heading", "aria-level": "3"})
+        or tag.find("div", {"role": "heading"})
+    )
+
+
+def has_youtube_link(tag):
+    """Return True if tag contains a link whose href mentions youtube."""
+    youtube_links = tag.findAll(
+        lambda x: x.name == "a" and "youtube" in x.get("href", "")
+    )
+    return bool(youtube_links)
+
+
+def get_raw_text(tag):
+    """Join the strings of tag with newlines, duplicates removed."""
+    return "\n".join(remove_redundant(tag.strings))
+
+
+def get_span_text(tag):
+    """Join the non-empty <span> texts of tag, duplicates removed."""
+    return "\n".join(
+        remove_redundant(
+            [e.text for e in tag.findAll("span") if e.text]
+        )
+    )
+
+
+class FeaturedSnippetParser(object):
+    """
+    Base class of featured snippet parsers.
+
+    Attributes of FEATURED_SNIPPET_ATTRIBUTES that a parser does not
+    provide are resolved to None by ``__getattr__``.
+    """
+
+    def __init__(self, text: str, tag: Tag):
+        self.text = text
+        self.tag = tag
+
+    def __getattr__(self, attr):
+        if attr in FEATURED_SNIPPET_ATTRIBUTES:
+            return None
+        raise AttributeError(f'{self.__class__.__name__}.{attr} is invalid.')
+
+    @property
+    def raw_text(self):
+        return get_raw_text(self.tag)
+
+    def to_dict(self):
+        """Return the value of every FEATURED_SNIPPET_ATTRIBUTES entry."""
+        return {
+            attr: getattr(self, attr) for attr in FEATURED_SNIPPET_ATTRIBUTES
+        }
+
+
+class SimpleFeaturedSnippetParser(FeaturedSnippetParser):
+    """Base parser of the snippets held by an "xpdopen" div."""
+
+    @classmethod
+    def get_instance(cls, text, tag):
+        """
+        return the parser matching the content of tag, None if none fits
+        """
+        if tag.table is not None:
+            return TableFeaturedSnippetParser(text, tag)
+        if tag.findAll(is_ol_but_not_a_menu):
+            return OrderedFeaturedSnippetParser(text, tag)
+        if tag.ul is not None:
+            return UnorderedFeaturedSnippetParser(text, tag)
+        if get_tag_heading(tag):
+            return DefinitionFeaturedSnippetParser(text, tag)
+        if has_youtube_link(tag):
+            return YoutubeFeaturedSnippetParser(text, tag)
+        return None
+
+    @property
+    def tag_link(self):
+        if hasattr(self, "_tag_link"):
+            return self._tag_link
+        self._tag_link = self.tag.find(
+            lambda tag: (
+                tag.name == "a"
+                and tag.has_attr("href")
+                and tag["href"].startswith("http")
+                and (tag.h3 or tag.h2) is not None
+            )
+        )
+        return self._tag_link
+
+    @property
+    def link(self):
+        return self.tag_link["href"] if self.tag_link else None
+
+    @property
+    def displayed_link(self):
+        return self.tag.cite.text if self.tag.cite else None
+
+    @property
+    def title(self):
+        if self.tag_link is None:
+            return None
+        tag_title = self.tag_link.h3 or self.tag_link.h2
+        return tag_title.text
+
+    @property
+    def heading(self):
+        tag_heading = get_tag_heading(self.tag)
+        return tag_heading.text if tag_heading is not None else None
+
+    @property
+    def snippet_str(self):
+        lines = []
+        for field in (
+            "heading", "snippet_str_body",
+            "displayed_link", "link", "title"
+        ):
+            if getattr(self, field):
+                lines.append(getattr(self, field))
+        return "\n".join(lines)
+
+    @property
+    def date(self):
+        return None
+
+    @property
+    def snippet_data(self):
+        return None
+
+    @property
+    def snippet_type(self):
+        return "Unknown Featured Snippet"
+
+    @property
+    def snippet_str_body(self):
+        return ""
+
+
+class TableFeaturedSnippetParser(SimpleFeaturedSnippetParser):
+    """Example: world university rankings 2019"""
+
+    @property
+    def snippet_type(self):
+        return "Table Featured Snippet"
+
+    @property
+    def snippet_str_body(self):
+        snippet_data = self.snippet_data
+        return tabulate(
+            header=snippet_data["columns"], table=snippet_data["values"])
+
+    @property
+    def response(self):
+        return self.snippet_str
+
+    @property
+    def snippet_data(self):
+        table_tag = self.tag.find("table")
+        tr_tags = table_tag.findAll("tr")
+        if tr_tags and tr_tags[0].find("th"):
+            columns = [
+                th_tag.text for th_tag in tr_tags[0].findAll("th")
+            ]
+            body_table_tags = tr_tags[1:]
+        else:
+            columns = None
+            body_table_tags = tr_tags
+        values = [
+            [td_tag.text for td_tag in tr_tag.findAll("td")]
+            for tr_tag in body_table_tags
+        ]
+        if columns is None:
+            columns = list(range(len(values[0]))) if values else []
+        return {
+            "columns": columns,
+            "values": values
+        }
+
+
+class OrderedFeaturedSnippetParser(SimpleFeaturedSnippetParser):
+    """Example: top grossing movies"""
+
+    @property
+    def snippet_type(self):
+        return "Ordered Featured Snippet"
+
+    @property
+    def response(self):
+        return self.snippet_str
+
+    @property
+    def snippet_str_body(self):
+        return "\n".join(itemize(self.snippet_data))
+
+    @property
+    def snippet_data(self):
+        # same predicate as get_instance, so that a menu <ol> placed
+        # before the list of the snippet is not parsed by mistake
+        ol_tag = self.tag.find(is_ol_but_not_a_menu)
+        li_tags = ol_tag.findAll("li")
+        return [tag.text for tag in li_tags]
+
+
+class UnorderedFeaturedSnippetParser(SimpleFeaturedSnippetParser):
+    """Example: What are 3 basic programming languages?"""
+
+    @property
+    def snippet_type(self):
+        return "Unordered Featured Snippet"
+
+    @property
+    def snippet_str_body(self):
+        return "\n".join(itemize(self.snippet_data))
+
+    @property
+    def response(self):
+        return self.snippet_str
+
+    @property
+    def snippet_data(self):
+        ul_tag = self.tag.find("ul")
+        li_tags = ul_tag.findAll("li")
+        return [tag.text for tag in li_tags]
+
+
+class DefinitionFeaturedSnippetParser(SimpleFeaturedSnippetParser):
+    """Why was ho chi minh a hero"""
+
+    @property
+    def snippet_type(self):
+        return "Definition Featured Snippet"
+
+    @property
+    def response(self):
+        return self.heading
+
+
+class YoutubeFeaturedSnippetParser(SimpleFeaturedSnippetParser):
+    """Ex: cheetah vs lion"""
+
+    @property
+    def snippet_type(self):
+        return "Youtube Featured Snippet"
+
+    @property
+    def heading(self):
+        return ""
+
+    @property
+    def response(self):
+        return self.link
+
+
+class MultipleCardsFeaturedSnippetTag(FeaturedSnippetParser):
+    """How to make a cold brew coffee"""
+
+    @property
+    def heading(self):
+        tag_heading = (
+            self.tag.find("h3", {"role": "heading"})
+            or self.tag.find("h2", {"role": "heading"})
+        )
+        return tag_heading.text if tag_heading is not None else None
+
+    @property
+    def snippet_type(self):
+        return "Multiple Cards Featured Snippet Tag"
+
+    def parse_card(self, tag_card):
+        """Return heading, title, link and raw text of a card tag."""
+        return {
+            "heading": tag_card.find("div", {"role": "heading"}).text,
+            "title": tag_card.cite.text,
+            "link": tag_card.a["href"],
+            "raw_text": get_raw_text(tag_card),
+        }
+
+    def str_card(self, card_data):
+        """Return the raw text of a parsed card followed by its link."""
+        lines = [card_data["raw_text"]]
+        lines.append(f"Link: {card_data['link']}")
+        return "\n".join(lines)
+
+    @property
+    def snippet_str(self):
+        if not self.snippet_data:
+            return ""
+        return "\n-------------\n".join(map(self.str_card, self.snippet_data))
+
+    @property
+    def snippet_data(self):
+        return list(map(self.parse_card, self.tag.findAll("g-inner-card")))
+
+    @property
+    def response(self):
+        return self.snippet_str
+
+
+class SingleCardFeaturedSnippetParser(FeaturedSnippetParser):
+    """What time is it"""
+
+    @property
+    def snippet_type(self):
+        return "Single Card FeaturedSnippet"
+
+    @property
+    def heading(self):
+        tag_heading = get_tag_heading(self.tag)
+        return get_raw_text(tag_heading) if tag_heading is not None else None
+
+    @property
+    def response(self):
+        heading = self.heading
+        if heading:
+            return heading
+        return self.raw_text
+
+    @property
+    def raw_text(self):
+        return get_span_text(self.tag)
+
+
+class WholePageTabContainer(FeaturedSnippetParser):
+    """Gangnam Style"""
+
+    @property
+    def snippet_type(self):
+        return "Whole Page Tab Container"
+
+    @property
+    def tag_link(self):
+        if hasattr(self, "_tag_link"):
+            return self._tag_link
+        self._tag_link = self.tag.find(
+            lambda tag: (
+                tag.name == "a"
+                and tag.has_attr("href")
+                and tag["href"].startswith("http")
+                and (tag.h3 or tag.h2) is not None
+            )
+        )
+        return self._tag_link
+
+    @property
+    def link(self):
+        return self.tag_link["href"] if self.tag_link else None
+
+    @property
+    def displayed_link(self):
+        return self.tag.cite.text if self.tag.cite else None
+
+    @property
+    def title(self):
+        if self.tag_link is None:
+            return None
+        tag_title = self.tag_link.h3 or self.tag_link.h2
+        return tag_title.text
+
+    @property
+    def response(self):
+        return self.raw_text
+
+    @property
+    def raw_text(self):
+        return get_span_text(self.tag)
+
+
+def is_simple_featured_snippet_tag(tag):
+    """True for an "xpdopen" div that is not the "People also ask" box."""
+    class_tuple = tuple(tag.get("class", ""))
+    is_xpdopen = (tag.name == "div" and class_tuple == ("xpdopen",))
+    if not is_xpdopen:
+        return False
+    is_xpdopen_of_related_questions = (
+        tag.h2 is not None and tag.h2.text == "People also ask"
+    )
+    return not is_xpdopen_of_related_questions
+
+
+def is_single_card_featured_snippet_tag(tag):
+    """True for a "card-section" div whose text does not start by "Tip:"."""
+    is_card_section = (
+        tag.name == "div" and "card-section" in tag.get("class", [])
+    )
+    if not is_card_section:
+        return False
+    is_card_section_of_tip = tag.text.startswith("Tip:")
+    return not is_card_section_of_tip
+
+
+def is_multiple_card_snippet_tag(tag):
+    """True for a <g-section-with-header> tag."""
+    return (tag.name == "g-section-with-header")
+
+
+def is_whole_page_tabs_container(tag):
+    """True for the tag whose id is "wp-tabs-container"."""
+    return (tag.get("id") == "wp-tabs-container")
+
+
+def is_web_results(tag):
+    """True for the <h2> heading "Web results"."""
+    return (tag.name == "h2" and tag.text == "Web results")
+
+
+def get_featured_snippet_tag(document):
+    """
+    return the tag of the featured snippet of document, None if not found
+
+    The first simple snippet, card or "Web results" heading of document
+    is looked up: a simple snippet is returned directly, otherwise the
+    whole page tabs container is preferred if there is one, and reaching
+    the "Web results" heading first means there is no featured snippet.
+    """
+
+    def lookup_featured_snippet_tag(tag):
+        return (
+            is_simple_featured_snippet_tag(tag)
+            or is_single_card_featured_snippet_tag(tag)
+            or is_multiple_card_snippet_tag(tag)
+            or is_web_results(tag)
+        )
+    whole_page_tag = document.find(is_whole_page_tabs_container)
+    tag = document.find(lookup_featured_snippet_tag)
+    if tag and is_simple_featured_snippet_tag(tag):
+        return tag
+    if whole_page_tag:
+        return whole_page_tag
+    if not tag or tag.name == "h2":
+        return None
+    return tag
+
+
+def get_featured_snippet_parser(question, document: BeautifulSoup):
+    """
+    return the parser of the featured snippet of document,
+    None if document has no featured snippet
+
+    :param str question: searched text
+    :param BeautifulSoup document: parsed search result page
+    """
+    tag = get_featured_snippet_tag(document)
+    if tag is None:
+        return None
+    if is_simple_featured_snippet_tag(tag):
+        return SimpleFeaturedSnippetParser.get_instance(question, tag)
+    if is_multiple_card_snippet_tag(tag):
+        return MultipleCardsFeaturedSnippetTag(question, tag)
+    if is_single_card_featured_snippet_tag(tag):
+        return SingleCardFeaturedSnippetParser(question, tag)
+    if is_whole_page_tabs_container(tag):
+        return WholePageTabContainer(question, tag)
diff --git a/people_also_ask/tests/fixtures/cheetah_vs_lion.html b/people_also_ask/tests/fixtures/cheetah_vs_lion.html
new file mode 100644
index 0000000..b9c99fa
--- /dev/null
+++ b/people_also_ask/tests/fixtures/cheetah_vs_lion.html
@@ -0,0 +1,206 @@
+cheetah vs lion - Google Search

Accessibility Links

Skip to main contentAccessibility help
Accessibility feedback
About 29,600,000 results (0.51 seconds) 

Footer Links

\ No newline at end of file diff --git a/people_also_ask/tests/fixtures/gangnam_style.html b/people_also_ask/tests/fixtures/gangnam_style.html new file mode 100644 index 0000000..515649f --- /dev/null +++ b/people_also_ask/tests/fixtures/gangnam_style.html @@ -0,0 +1,206 @@ +Gangnam style - Google Search

Accessibility Links

Skip to main contentAccessibility help
Accessibility feedback
About 28,500,000 results (0.63 seconds) 

Footer Links

\ No newline at end of file diff --git a/people_also_ask/tests/fixtures/how_to_make_a_cold_brew_coffee.html b/people_also_ask/tests/fixtures/how_to_make_a_cold_brew_coffee.html new file mode 100644 index 0000000..1d93944 --- /dev/null +++ b/people_also_ask/tests/fixtures/how_to_make_a_cold_brew_coffee.html @@ -0,0 +1,208 @@ +How to make a cold brew coffee - Google Search

Accessibility Links

Skip to main contentAccessibility help
Accessibility feedback
About 60,200,000 results (0.60 seconds) 

Ads

  1. Meet your New Saturday Staple, the Jameson® Cold Brew Dalgona Coffee. Delicious, Creamy, and Simple to Make at Home. Try One Today. Exceptional Smoothness. Triple Distilled.

    Coffee Lovers' Whiskey
    Distinctive, Bold and Natural

    Explore the Range of Jameson®
    Irish Whiskeys. See Our Whiskeys.

    People also search for

Ads

  1. Learn How to Make Rich, Full-Bodied & Smooth Cold Brew Coffee In 3 Easy Steps Today. Our Cold Brew Ground Coffee Packs Are Easy to Make At Home. Enjoy Them Over Ice! Rich Roasts. Rich And Smooth Coffee. Ready to Brew. Brew In Minutes. Brew at Home.

Footer Links

\ No newline at end of file diff --git a/people_also_ask/tests/fixtures/the_10_highest-grossing_movies_of_all_time.html b/people_also_ask/tests/fixtures/the_10_highest-grossing_movies_of_all_time.html new file mode 100644 index 0000000..33c5237 --- /dev/null +++ b/people_also_ask/tests/fixtures/the_10_highest-grossing_movies_of_all_time.html @@ -0,0 +1,204 @@ +The 10 highest-grossing movies of all time - Google Search

Accessibility Links

Skip to main contentAccessibility help
Accessibility feedback
About 5,150,000 results (0.64 seconds) 

Footer Links

\ No newline at end of file diff --git a/people_also_ask/tests/fixtures/what_are_3_basic_programming_languages.html b/people_also_ask/tests/fixtures/what_are_3_basic_programming_languages.html new file mode 100644 index 0000000..d8bb9b8 --- /dev/null +++ b/people_also_ask/tests/fixtures/what_are_3_basic_programming_languages.html @@ -0,0 +1,204 @@ +What are 3 basic programming languages - Google Search

Accessibility Links

Skip to main contentAccessibility help
Accessibility feedback
About 1,100,000,000 results (0.62 seconds) 

Footer Links

\ No newline at end of file diff --git a/people_also_ask/tests/fixtures/what_time_is_it.html b/people_also_ask/tests/fixtures/what_time_is_it.html new file mode 100644 index 0000000..3d9d810 --- /dev/null +++ b/people_also_ask/tests/fixtures/what_time_is_it.html @@ -0,0 +1,203 @@ +What time is it - Google Search

Accessibility Links

Skip to main contentAccessibility help
Accessibility feedback
About 14,100,000,000 results (0.50 seconds) 

Footer Links

\ No newline at end of file diff --git a/people_also_ask/tests/fixtures/why_was_ho_chi_minh_a_hero.html b/people_also_ask/tests/fixtures/why_was_ho_chi_minh_a_hero.html new file mode 100644 index 0000000..7c26117 --- /dev/null +++ b/people_also_ask/tests/fixtures/why_was_ho_chi_minh_a_hero.html @@ -0,0 +1,206 @@ +why was ho chi minh a hero - Google Search

Accessibility Links

Skip to main contentAccessibility help
Accessibility feedback
About 5,950,000 results (0.64 seconds) 

Footer Links

\ No newline at end of file diff --git a/people_also_ask/tests/fixtures/world_university_rankings_2019.html b/people_also_ask/tests/fixtures/world_university_rankings_2019.html new file mode 100644 index 0000000..597662a --- /dev/null +++ b/people_also_ask/tests/fixtures/world_university_rankings_2019.html @@ -0,0 +1,207 @@ +world university rankings 2019 - Google Search

Accessibility Links

Skip to main contentAccessibility help
Accessibility feedback
About 811,000,000 results (0.88 seconds) 

Footer Links

\ No newline at end of file
diff --git a/people_also_ask/tests/test_google.py b/people_also_ask/tests/test_google.py
new file mode 100644
index 0000000..75b1a35
--- /dev/null
+++ b/people_also_ask/tests/test_google.py
@@ -0,0 +1,33 @@
+import unittest
+from people_also_ask import google
+
+
+config = dict(
+    test_get_answer=dict(
+        text="Who is Ho Chi Minh?"
+    ),
+    test_get_related_questions=dict(
+        text="where is france"
+    )
+)
+
+
+class TestGoogle(unittest.TestCase):
+    """Tests of people_also_ask.google, which calls google.search."""
+
+    def test_get_answer(self):
+        answer = google.get_answer(config["test_get_answer"]["text"])
+        self.assertIsNotNone(answer)
+        self.assertTrue(answer["has_answer"])
+        self.assertIsNotNone(answer["response"])
+
+    def test_get_related_questions(self):
+        related_questions = google.get_related_questions(
+            config["test_get_related_questions"]["text"]
+        )
+        self.assertIsNotNone(related_questions)
+        self.assertGreater(len(related_questions), 0)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/people_also_ask/tests/test_parser.py b/people_also_ask/tests/test_parser.py
new file mode 100644
index 0000000..e090e58
--- /dev/null
+++ b/people_also_ask/tests/test_parser.py
@@ -0,0 +1,54 @@
+import os
+import unittest
+from bs4 import BeautifulSoup
+from people_also_ask.parser import (
+    get_featured_snippet_parser,
+    WholePageTabContainer,
+    TableFeaturedSnippetParser,
+    YoutubeFeaturedSnippetParser,
+    OrderedFeaturedSnippetParser,
+    UnorderedFeaturedSnippetParser,
+    DefinitionFeaturedSnippetParser,
+    MultipleCardsFeaturedSnippetTag,
+    SingleCardFeaturedSnippetParser,
+)
+
+
+HTMLS_PARSER = {
+    "cheetah_vs_lion.html": YoutubeFeaturedSnippetParser,
+    "gangnam_style.html": WholePageTabContainer,
+    "how_to_make_a_cold_brew_coffee.html": MultipleCardsFeaturedSnippetTag,
+    "the_10_highest-grossing_movies_of_all_time.html": (
+        OrderedFeaturedSnippetParser
+    ),
+    "what_are_3_basic_programming_languages.html": (
+        UnorderedFeaturedSnippetParser
+    ),
+    "what_time_is_it.html": SingleCardFeaturedSnippetParser,
+    "why_was_ho_chi_minh_a_hero.html": DefinitionFeaturedSnippetParser,
+    "world_university_rankings_2019.html": TableFeaturedSnippetParser
+}
+FIXTURES_DIR = os.path.join(
+    os.path.dirname(__file__),
+    "fixtures"
+)
+
+
+class TestParser(unittest.TestCase):
+
+    def test_parsers(self):
+        """Each fixture page must be handled by the expected parser."""
+        for html_filename, Parser in HTMLS_PARSER.items():
+            with self.subTest(html_filename=html_filename):
+                html_file = os.path.join(FIXTURES_DIR, html_filename)
+                with open(html_file, "r", encoding="utf-8") as fd:
+                    document = BeautifulSoup(fd.read(), "html.parser")
+                question, _ = os.path.splitext(html_filename)
+                question = question.replace("_", " ")
+                parser = get_featured_snippet_parser(question, document)
+                self.assertIsInstance(parser, Parser)
+                self.assertIsNotNone(parser.response)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/people_also_ask/tests/where_is_france b/people_also_ask/tests/where_is_france
new file mode 100644
index 0000000..97daac7
--- /dev/null
+++ b/people_also_ask/tests/where_is_france
@@ -0,0 +1,205 @@
+where is france - Google Search

Accessibility Links

Skip to main contentAccessibility help
Accessibility feedback
About 6,270,000,000 results (0.98 seconds) 

Footer Links

\ No newline at end of file diff --git a/people_also_ask/tests/who_is_ho_chi_minh? b/people_also_ask/tests/who_is_ho_chi_minh? new file mode 100644 index 0000000..004384a --- /dev/null +++ b/people_also_ask/tests/who_is_ho_chi_minh? @@ -0,0 +1,208 @@ +Who is Ho Chi Minh? - Google Search

Accessibility Links

Skip to main contentAccessibility help
Accessibility feedback
About 374,000,000 results (0.76 seconds) 

Complementary Results

Ho Chi Minh

Former Prime Minister of Vietnam

Description

Description

Hồ Chí Minh, born Nguyễn Sinh Cung, also known as Nguyễn Tất Thành, Nguyễn Ái Quốc, Bác Hồ, or simply Bác, was a Vietnamese revolutionary and politician. He served as Prime Minister of Vietnam from 1945 to 1955 and President from 1945 to 1969. Wikipedia
Born: May 19, 1890, Kim Lien, Vietnam
Died: September 2, 1969, Hanoi, Vietnam
Full name: Nguyễn Sinh Cung
Profession: Politician
Education: Lenin School (1934–1935), MORE
Click on the error
Feedback

Footer Links

\ No newline at end of file diff --git a/people_also_ask/tools.py b/people_also_ask/tools.py new file mode 100755 index 0000000..6616a94 --- /dev/null +++ b/people_also_ask/tools.py @@ -0,0 +1,83 @@ +#! /usr/bin/env python3 +import time +import traceback +from contextlib import ContextDecorator +from typing import Callable, List +from people_also_ask.exceptions import FeaturedSnippetParserError + + +def raise_featuredsnippetparsererror_if_failed(func): + def wrapper(self: "SimpleFeaturedSnippetParser", *args, **kwargs): + try: + return func(self, *args, **kwargs) + except Exception: + traceback.print_exc() + raise FeaturedSnippetParserError(self.text) + return wrapper + + +def retryable(nb_times_retry): + + def decorator(func: Callable): + + def wrapper(*args, **kwargs): + for _ in range(nb_times_retry-1): + try: + return func(*args, **kwargs) + except Exception: + pass + return func(*args, **kwargs) + + return wrapper + return decorator + + +def itemize(lines: List[str]) -> List[str]: + return ["\t- " + line for line in lines] + + +def tabulate(header, table): + length_columns = [] + if header: + table = [header] + table + length_columns = [len(str(e)) for e in header] + for row in table: + current_lengh = [len(str(e)) for e in row] + length_columns = [ + max(i, j) for i, j in zip(length_columns, current_lengh) + ] + tabulated_rows = [] + for row in table: + tabulated_rows.append("\t".join([ + str(e).rjust(length, " ") for e, length in zip(row, length_columns) + ])) + if header: + tabulated_rows.insert( + 1, + "\t".join(["-"*length for length in length_columns]) + ) + return "\n".join(tabulated_rows) + + +def remove_redundant(elements): return list(dict.fromkeys(elements)) + + +class CallingSemaphore(ContextDecorator): + + def __init__(self, nb_call_times_limit, expired_time): + self.nb_call_times_limit = nb_call_times_limit + self.expired_time = expired_time + self.called_timestamps = list() + + def __enter__(self): + while len(self.called_timestamps) > 
self.nb_call_times_limit: + now = time.time() + self.called_timestamps = list(filter( + lambda x: now - x < self.expired_time, + self.called_timestamps + )) + time.sleep(0.5) + self.called_timestamps.append(time.time()) + + def __exit__(self, *exc): + pass diff --git a/setup.py b/setup.py new file mode 100755 index 0000000..cd2b841 --- /dev/null +++ b/setup.py @@ -0,0 +1,20 @@ +import setuptools + +version = {} + +with open("README.md", "r") as fh: + long_description = fh.read() + +setuptools.setup( + name="people_also_ask", + version="0.0.1", + author="LE Van Tuan", + author_email="leavantuan2312@gmail.com", + packages=setuptools.find_packages(), + long_description=long_description, + install_requires=[ + "beautifulsoup4", + "requests" + ], + python_requires=">=3.6" +)