This repository has been archived by the owner on Mar 1, 2024. It is now read-only.

fix transforming error #501

Merged
merged 17 commits on Sep 7, 2023
25 changes: 17 additions & 8 deletions llama_hub/wordlift/base.py
@@ -1,5 +1,6 @@
 import requests
 from bs4 import BeautifulSoup
+import re
 from typing import List
 from llama_index.readers.base import BaseReader
 from llama_index.readers.schema.base import Document
@@ -10,8 +11,6 @@

 DATA_KEY = 'data'
 ERRORS_KEY = 'errors'
-DEFAULT_PAGE = 0
-DEFAULT_ROWS = 500


 class WordLiftLoaderError(Exception):
@@ -132,14 +131,21 @@ def transform_data(self, data: dict) -> List[Document]:
             for field in metadata_fields:
                 field_keys = field.split('.')
                 value = get_separated_value(row, field_keys)
+                if value is None:
+                    logging.warning(f"Using default value for {field}")
+                    value = 'n.a'
+                if isinstance(value, list) and len(value) != 0:
+                    value = value[0]
                 if is_url(value) and is_valid_html(value):
-                    value = value.replace('\n', '')
                     extra_info[field] = value
                 else:
-                    extra_info[field] = clean_value(value)
-
-            document = Document(text=text, extra_info=extra_info)
+                    cleaned_value = clean_value(value)
+                    cleaned_value = cleaned_value.replace('\n', '')
+                    extra_info[field] = cleaned_value
+            text = text.replace('\n', '')
+            plain_text = re.sub('<.*?>', '', text)
+            document = Document(text=plain_text, extra_info=extra_info)
             documents.append(document)

         return documents
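
The net effect of this hunk: metadata values are no longer passed through unguarded when they are null or wrapped in a list, non-URL values are routed through clean_value plus newline stripping, and the document body is flattened to plain text before the Document is built. A minimal sketch of the new body-cleaning step (the input string is illustrative; the real loader assembles text from the configured text fields of a WordLift GraphQL row):

import re

# Illustrative HTML-ish body text.
text = "<h1>Title</h1>\n<p>Some body text.</p>\n"

# Newlines are removed first, then a non-greedy regex drops anything
# tag-shaped, leaving plain text for the Document body.
text = text.replace('\n', '')
plain_text = re.sub('<.*?>', '', text)
print(plain_text)  # TitleSome body text.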
@@ -171,6 +177,9 @@ def alter_query(self):
         """
         from graphql import parse, print_ast
         from graphql.language.ast import ArgumentNode, NameNode, IntValueNode
+        DEFAULT_PAGE = 0
+        DEFAULT_ROWS = 500
+
         query = self.query
         page = DEFAULT_PAGE
         rows = DEFAULT_ROWS
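
DEFAULT_PAGE and DEFAULT_ROWS now live inside alter_query, the only place they are used. For context, a minimal sketch (assumed, not the loader's exact code) of how page/rows arguments can be injected into a parsed GraphQL query using the same graphql-core parse/print_ast/ArgumentNode machinery this method imports; the query string and field position are illustrative:

from graphql import parse, print_ast
from graphql.language.ast import ArgumentNode, NameNode, IntValueNode

DEFAULT_PAGE = 0
DEFAULT_ROWS = 500

ast = parse("query { entities { id } }")

# Grab the first field of the first operation (assumed shape for this sketch).
field = ast.definitions[0].selection_set.selections[0]

# IntValueNode carries its value as a string, per the GraphQL AST spec.
field.arguments = tuple(field.arguments) + (
    ArgumentNode(name=NameNode(value="page"),
                 value=IntValueNode(value=str(DEFAULT_PAGE))),
    ArgumentNode(name=NameNode(value="rows"),
                 value=IntValueNode(value=str(DEFAULT_ROWS))),
)

print(print_ast(ast))  # the field prints as entities(page: 0, rows: 500)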
@@ -259,18 +268,18 @@ def clean_html(text: str) -> str:
             response = requests.get(text)
             if response.status_code == 200:
                 html_content = response.text
-                soup = BeautifulSoup(html_content, 'html.parser')
+                soup = BeautifulSoup(html_content, 'lxml')
                 cleaned_text = soup.get_text()
             else:
                 cleaned_text = ""
         elif os.path.isfile(text):
             with open(text, 'r') as file:
-                soup = BeautifulSoup(file, 'html.parser')
+                soup = BeautifulSoup(file, 'lxml')
                 cleaned_text = soup.get_text()
         else:
             with warnings.catch_warnings():
                 warnings.filterwarnings("ignore", category=UserWarning)
-                soup = BeautifulSoup(text, 'html.parser')
+                soup = BeautifulSoup(text, 'lxml')
                 cleaned_text = soup.get_text()
         return cleaned_text
     except (requests.exceptions.RequestException, requests.exceptions.ConnectionError):
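
The final hunk swaps BeautifulSoup's parser from the stdlib 'html.parser' to 'lxml' in all three branches of clean_html. 'lxml' is faster and more tolerant of malformed markup, but it is a separate dependency; if the package is missing, BeautifulSoup raises FeatureNotFound. A small sketch of the behavior (the fallback is illustrative, not part of this PR):

from bs4 import BeautifulSoup, FeatureNotFound

html = "<html><body><p>Hello <b>world</b></p></body></html>"

try:
    # Same call clean_html now makes; requires `pip install lxml`.
    soup = BeautifulSoup(html, 'lxml')
except FeatureNotFound:
    # Stdlib fallback, the parser this PR moves away from.
    soup = BeautifulSoup(html, 'html.parser')

print(soup.get_text())  # Hello world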