init

lagranges · Sep 15, 2020 · 1c06c83 · 1c06c83
1 parent f09a9d7
commit 1c06c83
Show file tree

Hide file tree

Showing 20 changed files with 3,047 additions and 2 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,2 @@
+*_pycache_*
+*.swp
diff --git a/README.md b/README.md
@@ -1,2 +1,96 @@
-# people_also_ask
-Python wrapper for google people-alos-ask
+# People-also-ask
+
+[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/python/black)
+[![PyPI](https://img.shields.io/pypi/v/people_also_ask.svg)](https://pypi.org/project/people-also-ask)
+[![downloads](https://img.shields.io/pypi/dm/people_also_ask.svg)](https://pypistats.org/packages/people-also-ask)
+[![versions](https://img.shields.io/pypi/pyversions/people_also_ask.svg)](https://github.com/lagranges/people_also_ask)
+
+People-also-ask provides APIs to easily crawl the data of Google featured snippets.
+
+## ⚠ Warning
+Search engines like Google do not allow any sort of automated access to their service but from a legal point of view there is no known case or broken law. Google does not take legal action against scraping, likely for self-protective reasons.
+The API has been configured so that it does not abuse the Google search engine.
+
+## Installation
+```
+pip install people_also_ask 
+```
+
+## Usage
+The goal of ``people_also_ask`` is to provide a simple and easy-to-use API for retrieving information from Google Featured Snippets.
+
+### Importing
+```python
+import people_also_ask
+```
+
+### How to get related questions 
+```python
+people_also_ask.get_related_questions("coffee")
+
+['Is coffee good for your health?',
+ 	'Why is coffee bad for you?',
+ 	'Who invented coffee?',
+	'What do u know about coffee?']
+```
+
+### How to get more questions
+```python
+people_also_ask.get_related_questions("coffee", 5)
+
+['How did coffee originate?',
+	'Is coffee good for your health?',
+  'Who brought coffee America?',
+	'Who invented coffee?',
+	'Why is coffee bad for you?',
+	'Why is drinking coffee bad for you?']
+```
+
+### Generate unlimited questions
+```python
+for question in people_also_ask.generate_related_questions("coffee"):
+    print(question)
+
+Why is coffee bad for you?
+Who invented coffee?
+Is coffee good for your health?
+Who brought coffee America?
+How did coffee originate?
+Why is drinking coffee bad for you?
+....
+```
+
+### Get answer for a question
+```python
+people_also_ask.get_answer("Why is coffee bad for you?")
+
+{'has_answer': True,
+ 'question': 'Why is coffee bad for you?',
+ 'related_questions': ['Why is drinking coffee bad for you?',
+  'Is coffee good for your health?',
+  'Is coffee toxic to your body?',
+  'What does coffee do to your body?'],
+ 'response': 'Consuming too much caffeine can lead to jitteriness, anxiety, heart palpitations and even exacerbated panic attacks (34). If you are sensitive to caffeine and tend to become overstimulated, you may want to avoid coffee altogether. Another unwanted side effect is that it can disrupt sleep ( 35 ).Aug 30, 2018',
+ 'heading': 'Consuming too much caffeine can lead to jitteriness, anxiety, heart palpitations and even exacerbated panic attacks (34). If you are sensitive to caffeine and tend to become overstimulated, you may want to avoid coffee altogether. Another unwanted side effect is that it can disrupt sleep ( 35 ).Aug 30, 2018',
+ 'title': 'Coffee — Good or Bad? - Healthline',
+ 'link': 'https://www.healthline.com/nutrition/coffee-good-or-bad#:~:text=Consuming%20too%20much%20caffeine%20can,can%20disrupt%20sleep%20(%2035%20).',
+ 'displayed_link': 'www.healthline.com › nutrition › coffee-good-or-bad',
+ 'snippet_str': 'Consuming too much caffeine can lead to jitteriness, anxiety, heart palpitations and even exacerbated panic attacks (34). If you are sensitive to caffeine and tend to become overstimulated, you may want to avoid coffee altogether. Another unwanted side effect is that it can disrupt sleep ( 35 ).Aug 30, 2018\nwww.healthline.com › nutrition › coffee-good-or-bad\nhttps://www.healthline.com/nutrition/coffee-good-or-bad#:~:text=Consuming%20too%20much%20caffeine%20can,can%20disrupt%20sleep%20(%2035%20).\nCoffee — Good or Bad? - Healthline',
+ 'snippet_data': None,
+ 'date': None,
+ 'snippet_type': 'Definition Featured Snippet',
+ 'snippet_str_body': '',
+ 'raw_text': 'Featured snippet from the web\nConsuming too much caffeine can lead to jitteriness, anxiety, heart palpitations and even exacerbated panic attacks (34). If \nyou\n are sensitive to caffeine and tend to become overstimulated, \n may want to avoid \ncoffee\n altogether. Another unwanted side effect is that it can disrupt sleep ( 35 ).\nAug 30, 2018\nCoffee — Good or Bad? - Healthline\nwww.healthline.com\n › nutrition › coffee-good-or-bad'}
+```
+
+### Get Simple Answer for a question
+```python
+people_also_ask.get_simple_answer("Why is coffee bad for you?")
+
+'Consuming too much caffeine can lead to jitteriness, anxiety, heart palpitations and even exacerbated panic attacks (34). If you are sensitive to caffeine and tend to become overstimulated, you may want to avoid coffee altogether. Another unwanted side effect is that it can disrupt sleep ( 35 ).Aug 30, 2018'
+```
+
+
+### Generate questions and answers around a subject
+```python
+people_also_ask.generate_answer("coffee")
+```
diff --git a/people_also_ask/__init__.py b/people_also_ask/__init__.py
@@ -0,0 +1,8 @@
+#! /usr/bin/env python3
+from people_also_ask.google import (
+    get_answer,
+    generate_answer,
+    get_simple_answer,
+    get_related_questions,
+    generate_related_questions,
+)
diff --git a/people_also_ask/exceptions.py b/people_also_ask/exceptions.py
@@ -0,0 +1,67 @@
+#! /usr/bin/env python3
+"""
+Global related-questions exception and warning classes.
+"""
+
+
+# Where users are asked to report problems; interpolated into the
+# "Please report it on ..." part of the exception messages in this module.
+GITHUB_LINK = "Github"
+
+
+class RelatedQuestionError(Exception):
+    """Base Related-Questions exception class."""
+
+    def __init__(self, error):
+        self.error = error
+
+    def __unicode__(self):
+        return (
+            f'An unkown error occured: {self.error}.'
+            f' Please report it on {GITHUB_LINK}.'
+        )
+
+
+class FeaturedSnippetParserError(RelatedQuestionError):
+    """
+    Exception raised when failed to get answer from
+    search result page
+    """
+
+    def __init__(self, text):
+        self.keyword = text
+
+    def __unicode__(self):
+        return (
+            f"Cannot parse result page of '{self.text}'."
+            f" It mays due to a format change of result page."
+            f' Please report it on {GITHUB_LINK}.'
+        )
+
+
+class RelatedQuestionParserError(RelatedQuestionError):
+    """
+    Exception raised when failed to get related questions
+    from search result page
+    """
+
+    def __init__(self, text):
+        self.keyword = text
+
+    def __unicode__(self):
+        return (
+            f"Cannot parse result page of '{self.text}'."
+            f" It mays due to a format change of result page."
+            f' Please report it on {GITHUB_LINK}.'
+        )
+
+
+class GoogleSearchRequestFailedError(RelatedQuestionError):
+    """Exception raised when failed to request search on google"""
+
+    def __init__(self, url, keyword):
+        self.url = url
+        self.keyword = keyword
+
+    def __unicode__(self):
+        return (
+            f"Failed to requests {self.url}/{self.keyword}"
+        )
diff --git a/people_also_ask/google.py b/people_also_ask/google.py
@@ -0,0 +1,183 @@
+#! /usr/bin/env python3
+import os
+import sys
+import time
+import logging
+import requests
+from bs4 import BeautifulSoup
+from typing import List, Dict, Any, Optional, Generator
+
+from people_also_ask.tools import retryable
+from people_also_ask.parser import (
+    extract_related_questions,
+    get_featured_snippet_parser,
+)
+from people_also_ask.exceptions import (
+    GoogleSearchRequestFailedError,
+    RelatedQuestionParserError,
+    FeaturedSnippetParserError
+)
+from people_also_ask.tools import CallingSemaphore
+
+
+URL = "https://www.google.com/search"
+HEADERS = {
+    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
+    " AppleWebKit/537.36 (KHTML, like Gecko) "
+    "Chrome/84.0.4147.135 Safari/537.36"
+}
+SESSION = requests.Session()
+NB_TIMES_RETRY = 3
+NB_REQUESTS_LIMIT = os.environ.get(
+    "RELATED_QUESTION_NBREQUESTS_LIMIT", 25
+)
+NB_REQUESTS_DURATION_LIMIT = os.environ.get(
+    "RELATED_QUESTION_DURATION_LIMIT", 60  # seconds
+)
+logging.basicConfig()
+semaphore = CallingSemaphore(
+    NB_REQUESTS_LIMIT, NB_REQUESTS_DURATION_LIMIT
+)
+
+
+@retryable(3)
+def search(keyword: str) -> Optional[BeautifulSoup]:
+    """return html parser of google search result"""
+    params = {"q": keyword}
+    try:
+        with semaphore:
+            time.sleep(0.5)  # be nice with google :)
+            response = SESSION.get(URL, params=params, headers=HEADERS)
+    except Exception:
+        raise GoogleSearchRequestFailedError(URL, keyword)
+    if response.status_code != 200:
+        raise GoogleSearchRequestFailedError(URL, keyword)
+    return BeautifulSoup(response.text, "html.parser")
+
+
+def _get_related_questions(text: str) -> List[str]:
+    """
+    return a list of questions related to text.
+    These questions are from search result of text
+
+    :param str text: text to search
+    """
+    document = search(text)
+    if not document:
+        return []
+    try:
+        return extract_related_questions(document)
+    except Exception:
+        raise RelatedQuestionParserError(text)
+
+
+def generate_related_questions(text: str) -> Generator[str, None, None]:
+    """
+    generate the questions related to text,
+    these quetions are found recursively
+
+    :param str text: text to search
+    """
+    questions = set(_get_related_questions(text))
+    searched_text = set(text)
+    while questions:
+        text = questions.pop()
+        yield text
+        searched_text.add(text)
+        questions |= set(_get_related_questions(text))
+        questions -= searched_text
+
+
+def get_related_questions(text: str, max_nb_questions: Optional[int] = None):
+    """
+    return a number of questions related to text.
+    These questions are found recursively.
+
+    :param str text: text to search
+    """
+    if max_nb_questions is None:
+        return _get_related_questions(text)
+    nb_question_regenerated = 0
+    questions = set()
+    for question in generate_related_questions(text):
+        if nb_question_regenerated > max_nb_questions:
+            break
+        questions.add(question)
+        nb_question_regenerated += 1
+    return list(questions)
+
+
+def get_answer(question: str) -> Dict[str, Any]:
+    """
+    return a dictionary as answer for a question.
+
+    :param str question: asked question
+    """
+    document = search(question)
+    related_questions = extract_related_questions(document)
+    featured_snippet = get_featured_snippet_parser(
+            question, document)
+    if not featured_snippet:
+        res = dict(
+            has_answer=False,
+            question=question,
+            related_questions=related_questions,
+        )
+    else:
+        res = dict(
+            has_answer=True,
+            question=question,
+            related_questions=related_questions,
+        )
+        try:
+            res.update(featured_snippet.to_dict())
+        except Exception:
+            raise FeaturedSnippetParserError(question)
+    return res
+
+
+def generate_answer(text: str) -> Generator[dict, None, None]:
+    """
+    generate answers of questions related to text
+
+    :param str text: text to search
+    """
+    answer = get_answer(text)
+    questions = set(answer["related_questions"])
+    searched_text = set(text)
+    if answer["has_answer"]:
+        yield answer
+    while questions:
+        text = questions.pop()
+        answer = get_answer(text)
+        if answer["has_answer"]:
+            yield answer
+        searched_text.add(text)
+        questions |= set(get_answer(text)["related_questions"])
+        questions -= searched_text
+
+
+def get_simple_answer(question: str, depth: bool = False) -> str:
+    """
+    return a text as summary answer for the question
+
+    :param str question: asked quetion
+    :param bool depth: return the answer of first related question
+        if no answer found for question
+    """
+    document = search(question)
+    featured_snippet = get_featured_snippet_parser(
+            question, document)
+    if featured_snippet:
+        return featured_snippet.response
+    if depth:
+        related_questions = get_related_questions(question)
+        if not related_questions:
+            return ""
+        return get_simple_answer(related_questions[0])
+    return ""
+
+
+if __name__ == "__main__":
+    from pprint import pprint as print
+    print(get_answer(sys.argv[1]))