Skip to content

Commit

Permalink
hackernews support
Browse files Browse the repository at this point in the history
  • Loading branch information
Kaushik Ravichandran committed Nov 16, 2023
1 parent 1fbb74f commit d4540dc
Show file tree
Hide file tree
Showing 5 changed files with 244 additions and 0 deletions.
15 changes: 15 additions & 0 deletions evadb/third_party/databases/hackernews/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# coding=utf-8
# Copyright 2018-2023 EvaDB
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Hacker News search integration."""
148 changes: 148 additions & 0 deletions evadb/third_party/databases/hackernews/hackernews_handler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
# coding=utf-8
# Copyright 2018-2023 EvaDB
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import github
import pandas as pd
import requests
import json

from evadb.third_party.databases.hackernews.table_column_info import HACKERNEWS_COLUMNS
from evadb.third_party.databases.types import (
DBHandler,
DBHandlerResponse,
DBHandlerStatus,
)


class HackernewsSearchHandler(DBHandler):
    """
    Data source handler that exposes Hacker News search results as a table.

    Results are fetched from the Algolia-hosted Hacker News search API and
    published through a single table named ``search_results``.
    """

    # Search endpoint of the Algolia-hosted Hacker News API.
    SEARCH_URL = "https://hn.algolia.com/api/v1/search"
    # Page fetched to decide whether the machine has internet access.
    CONNECTIVITY_CHECK_URL = "https://www.google.com/"
    # Upper bound (seconds) on every HTTP request so that a stalled network
    # cannot hang a query forever.
    REQUEST_TIMEOUT = 10

    def __init__(self, name: str, **kwargs):
        """
        Initialize the handler.
        Args:
            name (str): name of the DB handler instance
            **kwargs: arbitrary keyword arguments for establishing the connection.
                ``query`` is the search text and ``tags`` an optional tag
                filter; both default to the empty string.
        """
        super().__init__(name)
        self.query = kwargs.get("query", "")
        self.tags = kwargs.get("tags", "")

    def connection(self) -> bool:
        """
        Report whether the connectivity probe page can be fetched.
        Returns:
            bool: True if the probe answered with HTTP 200, False otherwise
            (including when the request itself fails, e.g. DNS error or timeout).
        """
        try:
            response = requests.get(
                self.CONNECTIVITY_CHECK_URL, timeout=self.REQUEST_TIMEOUT
            )
        except requests.RequestException:
            # A failed request means "not connected"; callers expect a bool,
            # not an exception.
            return False
        return response.status_code == 200

    @property
    def supported_table(self):
        """
        Describe the tables served by this handler.
        Returns:
            dict: maps each table name to its ``columns`` and to a ``generator``
            yielding one dict per search hit. The generator is lazy: no HTTP
            request is made until the first row is pulled from it.
        """

        def _hackernews_topics_generator():
            # Let requests build the query string so that spaces and other
            # special characters in the query/tags are URL-encoded.
            params = {"query": self.query}
            if self.tags:
                params["tags"] = self.tags
            response = requests.get(
                self.SEARCH_URL, params=params, timeout=self.REQUEST_TIMEOUT
            )
            if response.status_code != 200:
                raise Exception("Could not reach website.")
            payload = json.loads(response.content)
            # The matching records live under "hits"; iterating the payload
            # itself would only walk its top-level keys.
            for row in payload.get("hits", []):
                # Not every hit carries every field, so missing ones become None.
                yield {
                    property_name: row.get(property_name)
                    for property_name, _ in HACKERNEWS_COLUMNS
                }

        mapping = {
            "search_results": {
                "columns": HACKERNEWS_COLUMNS,
                "generator": _hackernews_topics_generator(),
            },
        }
        return mapping

    def connect(self):
        """
        Set up the connection required by the handler.
        Returns:
            DBHandlerStatus
        """
        # Nothing to set up: every request is an independent HTTP call.
        return DBHandlerStatus(status=True)

    def disconnect(self):
        """
        Close any existing connections.
        """
        pass

    def check_connection(self) -> DBHandlerStatus:
        """
        Check connection to the handler.
        Returns:
            DBHandlerStatus
        """
        if self.connection():
            return DBHandlerStatus(status=True)
        return DBHandlerStatus(status=False, error="Not connected to the internet.")

    def get_tables(self) -> DBHandlerResponse:
        """
        Return the list of tables in the database.
        Returns:
            DBHandlerResponse
        """
        if not self.connection():
            return DBHandlerResponse(data=None, error="Not connected to the internet.")

        try:
            tables_df = pd.DataFrame(
                list(self.supported_table), columns=["table_name"]
            )
            return DBHandlerResponse(data=tables_df)
        except Exception as e:
            return DBHandlerResponse(data=None, error=str(e))

    def get_columns(self, table_name: str) -> DBHandlerResponse:
        """
        Returns the list of columns for the given table.
        Args:
            table_name (str): name of the table whose columns are to be retrieved.
        Returns:
            DBHandlerResponse
        """
        if not self.connection():
            return DBHandlerResponse(data=None, error="Not connected to the internet.")
        try:
            tables = self.supported_table
            if table_name not in tables:
                # Same message as select() instead of a bare KeyError repr.
                return DBHandlerResponse(
                    data=None,
                    error="{} is not supported or does not exist.".format(table_name),
                )
            columns_df = pd.DataFrame(
                tables[table_name]["columns"], columns=["name", "dtype"]
            )
            return DBHandlerResponse(data=columns_df)
        except Exception as e:
            return DBHandlerResponse(data=None, error=str(e))

    def select(self, table_name: str) -> DBHandlerResponse:
        """
        Returns a generator that yields the data from the given table.
        Args:
            table_name (str): name of the table whose data is to be retrieved.
        Returns:
            DBHandlerResponse
        """
        # connection is a method: it must be called, otherwise the bound
        # method object is always truthy and this guard never fires.
        if not self.connection():
            return DBHandlerResponse(data=None, error="Not connected to the database.")
        try:
            tables = self.supported_table
            if table_name not in tables:
                return DBHandlerResponse(
                    data=None,
                    error="{} is not supported or does not exist.".format(table_name),
                )

            return DBHandlerResponse(
                data=None,
                data_generator=tables[table_name]["generator"],
            )
        except Exception as e:
            return DBHandlerResponse(data=None, error=str(e))
24 changes: 24 additions & 0 deletions evadb/third_party/databases/hackernews/table_column_info.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# coding=utf-8
# Copyright 2018-2023 EvaDB
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Columns of the Hacker News `search_results` table, as [name, python type]
# pairs. Each name is also the key read from a search hit by the handler.
HACKERNEWS_COLUMNS = [
    ["title", str],
    ["url", str],
    ["author", str],
    ["points", int],
    ["story_text", str],
    ["num_comments", int],
]
2 changes: 2 additions & 0 deletions evadb/third_party/databases/interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ def _get_database_handler(engine: str, **kwargs):
return mod.SnowFlakeDbHandler(engine, **kwargs)
elif engine == "github":
return mod.GithubHandler(engine, **kwargs)
elif engine == "hackernews":
return mod.HackernewsSearchHandler(engine, **kwargs)
elif engine == "slack":
return mod.SlackHandler(engine, **kwargs)
else:
Expand Down
55 changes: 55 additions & 0 deletions test/integration_tests/long/test_hackernews_datasource .py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# coding=utf-8
# Copyright 2018-2023 EvaDB
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from test.util import get_evadb_for_testing

import pytest

from evadb.server.command_handler import execute_query_fetch_all
from evadb.third_party.databases.github.table_column_info import STARGAZERS_COLUMNS


@pytest.mark.notparallel
class HackernewsDataSourceTest(unittest.TestCase):
    """Integration test for the `hackernews` database engine."""

    def setUp(self):
        self.evadb = get_evadb_for_testing()
        # reset the catalog manager before running each test
        self.evadb.catalog().reset()

    def tearDown(self):
        execute_query_fetch_all(self.evadb, "DROP DATABASE IF EXISTS hackernews_data;")

    @pytest.mark.xfail(reason="Flaky testcase due to `bad request` error message")
    def test_should_run_select_query_in_hackernews(self):
        # Imported here so the test checks against the Hacker News schema
        # rather than the GitHub one imported at module level.
        from evadb.third_party.databases.hackernews.table_column_info import (
            HACKERNEWS_COLUMNS,
        )

        # Create database.
        params = {
            "query": "EVADB",
            "tags": "story",
        }
        query = f"""CREATE DATABASE hackernews_data
                    WITH ENGINE = "hackernews",
                    PARAMETERS = {params};"""
        execute_query_fetch_all(self.evadb, query)

        query = "SELECT * FROM hackernews_data.search_results LIMIT 5;"
        batch = execute_query_fetch_all(self.evadb, query)
        # The row count must agree with the LIMIT clause above.
        self.assertEqual(len(batch), 5)
        expected_column = [
            "search_results.{}".format(col) for col, _ in HACKERNEWS_COLUMNS
        ]
        self.assertEqual(batch.columns, expected_column)

# Run the tests in this module when the file is executed directly.
if __name__ == "__main__":
    unittest.main()

0 comments on commit d4540dc

Please sign in to comment.