Skip to content

Commit

Permalink
fix: Do not skip the entire doc if a single page is skipped
Browse files Browse the repository at this point in the history
  • Loading branch information
clemlesne committed Jun 10, 2024
1 parent 8a0c4db commit 8902a32
Showing 1 changed file with 2 additions and 2 deletions.
4 changes: 2 additions & 2 deletions function_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -365,14 +365,14 @@ async def synthesis_to_page(input: BlobClientTrigger) -> None:
page = _clean_page(page)
if not page:
logger.info(f"Page skipped ({blob_name})")
return
continue
# Second, filter-out pages with excessive repetition
if _is_repetition_removal(
text=page,
threshold_ratio=1.5, # We are less strict than the paper because this is all normally internal data and we are not training a model
):
logger.info(f"Repetition detected, skipping ({blob_name})")
return
continue
out_model = PagedDocumentModel(
chunk_content=synthesis_model.chunk_content,
chunk_number=synthesis_model.chunk_number,
Expand Down

0 comments on commit 8902a32

Please sign in to comment.