diff --git a/Dockerfile b/Dockerfile index f4fe28a..ac788b4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,10 +5,22 @@ ENV APP_HOME /app # install Java RUN mkdir -p /usr/share/man/man1 && \ apt-get update -y && \ - apt-get install -y openjdk-17-jre-headless && \ - apt-get install -y libxml2-dev && \ - apt-get install -y libxslt-dev && \ - apt-get install -y build-essential + apt-get install -y openjdk-17-jre-headless +# install essential packages +RUN apt-get install -y \ + libxml2-dev libxslt-dev \ + build-essential libmagic-dev +# install tesseract +RUN apt-get install -y \ + tesseract-ocr \ + lsb-release \ + && echo "deb https://notesalexp.org/tesseract-ocr5/$(lsb_release -cs)/ $(lsb_release -cs) main" | tee /etc/apt/sources.list.d/notesalexp.list > /dev/null \ + && apt-get update -oAcquire::AllowInsecureRepositories=true \ + && apt-get install notesalexp-keyring -oAcquire::AllowInsecureRepositories=true -y --allow-unauthenticated \ + && apt-get update \ + && apt-get install -y \ + tesseract-ocr libtesseract-dev \ + && wget -P /usr/share/tesseract-ocr/5/tessdata/ https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata RUN apt-get install unzip -y && \ apt-get install git -y && \ apt-get autoremove -y @@ -21,4 +33,4 @@ RUN pip install -r requirements.txt RUN python -m nltk.downloader stopwords RUN python -m nltk.downloader punkt RUN chmod +x run.sh -CMD ./run.sh +CMD ./run.sh \ No newline at end of file diff --git a/README.md b/README.md index 5da9772..c134170 100644 --- a/README.md +++ b/README.md @@ -8,15 +8,16 @@ The PDF parser works off text layer and also offers a OCR option (apply_ocr) to Check out the notebook [pdf_visual_ingestor_step_by_step](notebooks/pdf_visual_ingestor_step_by_step.ipynb) to experiment directly with the PDF parser. The PDF Parser offers the following features: - 1. Sections and subsections along with their levels. - 2. Paragraphs - combines lines. - 3. Links between sections and paragraphs. - 5. 
Tables along with the section the tables are found in. - 6. Lists and nested lists. - 7. Join content spread across pages. - 8. Removal of repeating headers and footers. - 9. Watermark removal. - 10. OCR with boundary boxes + +1. Sections and subsections along with their levels. +2. Paragraphs - combines lines. +3. Links between sections and paragraphs. +5. Tables along with the section the tables are found in. +6. Lists and nested lists. +7. Join content spread across pages. +8. Removal of repeating headers and footers. +9. Watermark removal. +10. OCR with boundary boxes ### HTML A special HTML parser that creates layout aware blocks to make RAG performance better with higher quality chunks. @@ -47,14 +48,22 @@ In some cases, your PDFs may result in errors in the Java server and you will ne python -m nlm_ingestor.ingestion_daemon ``` ### Run the docker file -A docker image is available via github container registry. Before running the following code, you may need to authenticate with docker first -cat ~/TOKEN.txt | docker login https://ghcr.io -u USERNAME --password-stdin -where TOKEN.txt is the token you create as described here: https://docs.github.com/en/enterprise-server@3.7/packages/working-with-a-github-packages-registry/working-with-the-docker-registry +A docker image is available via public github container registry. +Pull the docker image ``` docker pull ghcr.io/nlmatics/nlm-ingestor:latest -docker run nlm-ingestor- ``` +Run the docker image mapping the port 5001 to port of your choice. 
+``` +docker run -p 5010:5001 ghcr.io/nlmatics/nlm-ingestor:latest- +``` +Once you have the server running, your llmsherpa url will be: +"http://localhost:5010/api/parseDocument?renderFormat=all" +- to apply OCR add &applyOcr=yes +- to use the new indent parser which uses a different algorithm to assign header levels, add &useNewIndentParser=yes +- this server is good for your development - in production it is recommended to run this behind a secure gateway using nginx or cloud gateways + ### Test the ingestor server Sample test code to test the server with llmsherpa parser is in this [notebook](notebooks/test_llmsherpa_api.ipynb). diff --git a/nlm_ingestor/file_parser/tika_parser.py b/nlm_ingestor/file_parser/tika_parser.py index 2624234..fb10955 100644 --- a/nlm_ingestor/file_parser/tika_parser.py +++ b/nlm_ingestor/file_parser/tika_parser.py @@ -14,7 +14,7 @@ def __init__(self): def parse_to_html(self, filepath, do_ocr=False): # Turn off OCR by default - timeout = 9000 + timeout = 3000 headers = { "X-Tika-OCRskipOcr": "true" } diff --git a/nlm_ingestor/ingestion_daemon/__main__.py b/nlm_ingestor/ingestion_daemon/__main__.py index fc53c3e..e88d67e 100644 --- a/nlm_ingestor/ingestion_daemon/__main__.py +++ b/nlm_ingestor/ingestion_daemon/__main__.py @@ -20,15 +20,17 @@ def parse_document( render_format: str = "all", ): render_format = request.args.get('renderFormat', 'all') - use_new_indent_parser = request.args.get('useNewIndentParser', 'all') + use_new_indent_parser = request.args.get('useNewIndentParser', 'no') + apply_ocr = request.args.get('applyOcr', 'no') file = request.files['file'] tmp_file = None try: parse_options = { "parse_and_render_only": True, "render_format": render_format, - "use_new_indent_parser": use_new_indent_parser, - "parse_pages": () + "use_new_indent_parser": use_new_indent_parser == "yes", + "parse_pages": (), + "apply_ocr": apply_ocr == "yes" } # save the incoming file to a temporary location filename = secure_filename(file.filename) 
@@ -52,6 +54,7 @@ def parse_document( ) except Exception as e: + print("error uploading file, stacktrace: ", traceback.format_exc()) logger.error( f"error uploading file, stacktrace: {traceback.format_exc()}", exc_info=True, @@ -65,8 +68,7 @@ def parse_document( def main(): logger.info("Starting ingestor service..") - app.run(host="0.0.0.0", port=5001, debug=False) - + app.run(host="0.0.0.0", port=5001, debug=True) if __name__ == "__main__": main() diff --git a/nlm_ingestor/ingestor/ingestor_api.py b/nlm_ingestor/ingestor/ingestor_api.py index ef9d14c..ed7e153 100644 --- a/nlm_ingestor/ingestor/ingestor_api.py +++ b/nlm_ingestor/ingestor/ingestor_api.py @@ -33,6 +33,7 @@ def ingest_document( logger.info(f"Parsing {mime_type} at {doc_location} with name {doc_name}") if mime_type == "application/pdf": logger.info("using pdf parser") + print("testing..", parse_options) pdfi = pdf_ingestor.PDFIngestor(doc_location, parse_options) return_dict = pdfi.return_dict elif mime_type in {"text/markdown", "text/x-markdown"}: diff --git a/nlm_ingestor/ingestor/pdf_ingestor.py b/nlm_ingestor/ingestor/pdf_ingestor.py index aade590..755bd55 100644 --- a/nlm_ingestor/ingestor/pdf_ingestor.py +++ b/nlm_ingestor/ingestor/pdf_ingestor.py @@ -7,12 +7,9 @@ from bs4 import BeautifulSoup -from . 
import table_builder from nlm_ingestor.file_parser import pdf_file_parser from timeit import default_timer -from .visual_ingestor import table_parser from .visual_ingestor import visual_ingestor -from nlm_ingestor.ingestor.visual_ingestor import block_renderer from nlm_ingestor.ingestor.visual_ingestor.new_indent_parser import NewIndentParser from nlm_ingestor.ingestor_utils.utils import NpEncoder, \ detect_block_center_aligned, detect_block_center_of_page diff --git a/nlm_ingestor/ingestor_utils/info_extractor.py b/nlm_ingestor/ingestor_utils/info_extractor.py deleted file mode 100644 index 83ae82f..0000000 --- a/nlm_ingestor/ingestor_utils/info_extractor.py +++ /dev/null @@ -1,276 +0,0 @@ -import os -from collections import Counter - -from nlm_utils.model_client.classification import ClassificationClient - -from nlm_ingestor.ingestor import line_parser -from nlm_ingestor.ingestor.visual_ingestor import table_parser -import copy - -from nlm_utils.utils import ensure_bool -from nlm_utils.utils import query_preprocessing as preprocess - -use_qatype = ensure_bool(os.getenv("USE_QATYPE", False)) or ensure_bool(os.getenv("INDEX_QATYPE", False)) - - -def create_all_definition_links(kv_pairs, all_quoted_words): - all_definitions = {} - for kv in kv_pairs: - if kv["key"] not in all_definitions: - all_definitions[kv["key"]] = [] - all_definitions[kv["key"]].append({ - "block_idx": kv["block"]["block_idx"], - "block_text": kv["block"]["block_text"], - }) - - for qw in all_quoted_words: - quote_block_texts = [ - {"text": qw_context.get('block_text', ''), "block_idx": qw_context["block_idx"]} - for qw_context in all_quoted_words[qw] - ] - definition_contexts = preprocess.identify_non_reporting_expression(qw, quote_block_texts) - if qw not in all_definitions: - all_definitions[qw] = [] - for def_context in definition_contexts: - is_already_added = False - for def_cont in all_definitions[qw]: - if def_cont["block_idx"] == def_context["block_idx"]: - is_already_added = True - 
break - if not is_already_added: - all_definitions[qw].append({ - "block_idx": def_context["block_idx"], - "block_text": def_context["text"], - }) - return all_definitions - - -def extract_key_data( - texts, - infos, - bbox={}, - add_info=False, - do_summaries=True, -): - qa_client = None - if use_qatype: - qa_client = ClassificationClient( - model="roberta", - task="roberta-phraseqa", - url=os.getenv("MODEL_SERVER_URL", "https://services.nlmatics.com"), - ) - noun_chunk_locs = {} - # all noun chunks in the document - noun_chunks = [] - # summary by header - summary_by_header = {} - # mapping from noun_chunk to headers - noun_chunk_headers = {} - # all the contexts that have quoted words e.g. definitions - kv_pairs = [] - qw_queries = [] - qw_contexts = [] - all_quoted_words = {} - - def get_summary_key(info): - # info["header_text"] - if info: - return info["block_text"] + "-" + str(info["block_idx"]) - else: - return "" - - for match_idx, (text, info) in enumerate(zip(texts, infos)): - if "ignore" not in info or not info["ignore"]: - if do_summaries and info["block_type"] == "header": - summary_by_header[get_summary_key(info)] = { - "title": info["block_text"], - "block": copy.deepcopy(info), - "block_idx": info["block_idx"], - "match_idx": match_idx, - "noun_chunks": [], - "n_quoted_words": 0, - "kv_pairs": [], - "tables": [], - "table_bbox": [], - "audited": False, - } - - if info["block_idx"] in bbox: - summary_by_header[get_summary_key(info)]["header_bbox"] = bbox[ - info["block_idx"] - ]["bbox"] - - if ( - info["block_type"] != "table_row" - and table_parser.row_group_key not in info - ): - line = line_parser.Line(text) - quoted_words = [] - for qw in line.quoted_words: - stop_word = True - # Remove any stop words. 
- for word in qw.split(): - if word not in preprocess.CROSS_REFERENCE_STOP_WORDS: - stop_word = False - break - if not stop_word and len(qw) >= 2: - quoted_words.append(qw) - if len(quoted_words) > 0: - for qw in quoted_words: - kv_data = { - "block": info, # so that we can link in UI - "all_quoted_words": quoted_words, # to know all other quoted words in query - "key": qw, # key of the k, v pairs we are extracting - } - if add_info: - kv_data["match_text"] = text - kv_pairs.append( - kv_data, - ) - if qw not in all_quoted_words: - all_quoted_words[qw] = [ - info, - ] - else: - if info not in all_quoted_words[qw]: - all_quoted_words[qw].append(info) - qw_queries.append(qw) - qw_contexts.append(text) - noun_chunks.extend(line.noun_chunks) - - for chunk in line.noun_chunks: - if chunk not in noun_chunk_locs: - noun_chunk_locs[chunk] = [match_idx] - else: - noun_chunk_locs[chunk].append(match_idx) - header_text = info["header_text"] - if not header_text == "" and not info["block_type"] == "header": - header_block_info = { - "block_idx": info["header_block_idx"], - "block_text": info["header_text"], - } - key = get_summary_key(header_block_info) - if key not in summary_by_header: - continue - summary_by_header[key]["noun_chunks"].append(chunk) - chunk_key = chunk - if chunk_key not in noun_chunk_headers: - noun_chunk_headers[chunk_key] = [header_text] - else: - noun_chunk_headers[chunk_key].append(header_text) - - # todo use yi's dataframe code here - table = {} - header_block_info = None - is_rendering_table = False - if do_summaries: - for block in infos: - if "is_table_start" in block: - is_rendering_table = True - header_block_info = { - "block_idx": block["header_block_idx"], - "block_text": block["header_text"], - } - if block["block_idx"] in bbox: - table_bbox = bbox[block["block_idx"]]["bbox"] - audited = bbox[block["block_idx"]]["audited"] - else: - table_bbox = [-1, -1, -1, -1] - audited = False - table = {"rows": [], "cols": [], "name": 
header_block_info["block_text"]} - if is_rendering_table: - cell_values = block.get("cell_values", []) - if "is_header" in block: - table["cols"] = cell_values - elif "is_header_group" not in block: - table["rows"].append(cell_values) - # If we are rendering a table, do not consider headers - if block["block_type"] == "header" and \ - "is_table_start" not in block and \ - get_summary_key(block) in summary_by_header: - del summary_by_header[get_summary_key(block)] - if "is_table_end" in block and header_block_info: - summary_key = get_summary_key(header_block_info) - if summary_key and summary_key not in summary_by_header: - summary_by_header[summary_key] = { - "title": header_block_info["block_text"], - "block": copy.deepcopy(block), - "block_idx": header_block_info["block_idx"], - "match_idx": match_idx, - "header_bbox": [-1, -1, -1, -1], - "noun_chunks": [], - "n_quoted_words": 0, - "kv_pairs": [], - "tables": [], - "table_bbox": [], - "audited": False, - } - else: - summary_by_header[summary_key]["block"]["table_page_idx"] = block["page_idx"] - - summary_by_header[summary_key]["tables"].append(table) - summary_by_header[summary_key]["table_bbox"].append(table_bbox) - summary_by_header[summary_key]["audited"] = audited - - is_rendering_table = False - - if qw_queries and qa_client: - qw_answers = qa_client(qw_queries, qw_contexts)["answers"] - for qw_info, (_, qw_answer) in zip(kv_pairs, qw_answers[0].items()): - qw_info["value"] = qw_answer["text"] - # now merge the result from individual kv pairs into header wise summary - # filtered_kv_pairs = [] - if qa_client: - kv_pairs = [ - item for item in kv_pairs if item.get("value", "") - ] # Select entries with non-null value - else: - kv_pairs = [] - if do_summaries: - for pair in kv_pairs: - header_key = get_summary_key(pair["block"]) - if header_key not in summary_by_header: - continue - kv = {"key": pair["key"], "value": pair["value"]} - summary = summary_by_header[header_key] - summary["block"]["n_quoted_words"] 
= len(pair["all_quoted_words"]) - summary["kv_pairs"].append(kv) - # turn the map into a list - summaries = [] - for header_text, summary in summary_by_header.items(): - # summary["title"] = header_text - counter = Counter(summary["noun_chunks"]) - top_chunks = counter.most_common(8) - summary["noun_chunks"] = [] - for chunk in top_chunks: - summary["noun_chunks"].append(chunk[0]) - - # detach tables from headers, so the paras won't indent under the table - if summary["block"]["block_type"] == "header" and len(summary["tables"]) > 0: - # # duplicate the block summary for header and table - # header_block = copy.deepcopy(summary) - # header_block["tables"] = [] - # header_block["table_bbox"] = [] - # summaries.append(header_block) - # - # # offset the table block summary by 1 - # summary["block_idx"] += 1 - # summary["match_idx"] += 1 - # summary["block"]["block_idx"] += 1 - # summary["block"]["header_block_idx"] += 1 - # summary["block"]["header_match_idx"] += 1 - # summary["block"]["level"] += 1 - - # insert header text to the beginning (nearest) header chain, so that outline in the UI re-maps it - summary["block"]["level_chain"].insert(0, { - "block_idx": summary["block_idx"], - "block_text": summary["block"]["block_text"], - }) - summaries.append(summary) - # sort the summaries by order of appearance - summaries.sort(key=lambda x: x["match_idx"]) - - # Construct the reference definitions to enable linking. 
- reference_definitions = create_all_definition_links(kv_pairs, all_quoted_words) - - return summaries, kv_pairs, reference_definitions diff --git a/notebooks/pdf_visual_ingestor_step_by_step.ipynb b/notebooks/pdf_visual_ingestor_step_by_step.ipynb index 4058474..a95b178 100644 --- a/notebooks/pdf_visual_ingestor_step_by_step.ipynb +++ b/notebooks/pdf_visual_ingestor_step_by_step.ipynb @@ -95,15 +95,24 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 32, "id": "57d655f9-af7f-4c04-a016-b1a9dc1882f0", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ocr\n" + ] + } + ], "source": [ "doc_loc = '/Users/ambikasukla/projects/data/sample-8k.pdf'\n", + "doc_loc = '/Users/ambikasukla/Downloads/scansmpl.pdf'\n", "\n", "# by default we will turn off ocr as it is slow, use true here to parse ocr files\n", - "needs_ocr = False\n", + "needs_ocr = True\n", "timeout = 3000\n", "if not needs_ocr:\n", " headers = {\n", @@ -113,9 +122,9 @@ "else:\n", " print(\"ocr\")\n", " headers = {\n", - " \"X-Tika-OCRskipOcr\": \"true\",\n", + " \"X-Tika-OCRskipOcr\": \"false\",\n", " \"X-Tika-OCRoutputType\": \"hocr\",\n", - "# \"X-Tika-OCRocrEngineMode\": \"3\",\n", + " \"X-Tika-OCRocrEngineMode\": \"3\",\n", " \"X-Tika-PDFExtractInlineImages\":\"false\",\n", " \"X-Tika-Timeout-Millis\": str(100*timeout),\n", " \"X-Tika-OCRtimeoutSeconds\": str(timeout),\n", @@ -133,7 +142,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 34, "id": "bca9a811-b789-4989-908d-78462ace4004", "metadata": {}, "outputs": [ @@ -142,951 +151,47 @@ "text/html": [ "\n", "\n", - "\n", - "\n", - "\n", + "\n", + "\n", + "\n", "\n", "\n", "\n", - "\n", - "\n", - "\n", - "\n", - "\n", + "\n", + "\n", + "\n", + "\n", "\n", - "\n", - "\n", + "\n", "\n", "\n", - "\n", - "\n", - "\n", - "\n", + "\n", + "\n", + "\n", + "\n", "\n", - "\n", - "\n", - "\n", - "\n", - "\n", + "\n", + "\n", + "\n", "\n", "\n", - "\n", + 
"\n", "\n", "\n", "\n", "\n", "\n", "\n", - "\n", - "\n", - "Form 8-K for Microsoft Corp filed 10/16/2023\n", + "\n", + "Sample Scanned Image\n", "\n", - "
\n", - "\n", - "\n", - "

UNITED STATES

\n", - "\n", - "

SECURITIES AND EXCHANGE COMMISSION

\n", - "\n", - "

Washington, D.C. 20549

\n", - "\n", - "\n", - "

FORM 8-K

\n", - "\n", - "\n", - "

CURRENT REPORT

\n", - "\n", - "

Pursuant to Section 13 or 15(d) of the Securities Exchange Act of 1934

\n", - "\n", - "\n", - "

Date of Report (date of earliest event reported): October 16, 2023

\n", - "\n", - "\n", - "

MICROSOFT CORPORATION

\n", - "\n", - "

(Exact name of registrant as specified in its charter)

\n", - "\n", - "\n", - "

Washington

\n", - "

001-37845

\n", - "

91-1144442

\n", - "\n", - "

(State or other jurisdiction of

\n", - "\n", - "

incorporation)

\n", - "\n", - "

(Commission File Number)

\n", - "

(IRS Employer Identification No.)

\n", - "\n", - "\n", - "

One Microsoft Way

\n", - " \n", - "

Redmond, Washington

\n", - "

98052-6399

\n", - "\n", - "

(Address of principal executive offices)

\n", - "

(Zip Code)

\n", - "\n", - "\n", - "

Registrant's telephone number, including area code: (425) 882-8080

\n", - "\n", - "\n", - "

(Former name or former address, if changed since last report)

\n", - "\n", - "\n", - "

Check the appropriate box below if the Form 8-K filing is intended to simultaneously satisfy the filing obligation of the registrant under any of the

\n", - "\n", - "

following provisions (see General Instruction A.2):

\n", - "\n", - " \n", - "

\n", - "

Written communications pursuant to Rule 425 under the Securities Act (17 CFR 230.425)

\n", - "\n", - " \n", - "

\n", - "

Soliciting material pursuant to Rule 14a-12 under the Exchange Act (17 CFR 240.14a-12)

\n", - "\n", - " \n", - "

\n", - "

Pre-commencement communications pursuant to Rule 14d-2(b) under the Exchange Act (17 CFR 240.14d-2(b))

\n", - "\n", - " \n", - "

\n", - "

Pre-commencement communications pursuant to Rule 13e-4(c) under the Exchange Act (17 CFR 240.13e-4(c))

\n", - "\n", - "\n", - "

Securities registered pursuant to Section 12(b) of the Act:

\n", - "\n", - "\n", - "

Title of each class

\n", - "

Trading Symbol(s)

\n", - "

Name of each exchange on which

\n", - "\n", - "

registered

\n", - "\n", - "

Common stock, $0.00000625 par value per share

\n", - "

MSFT

\n", - "

NASDAQ

\n", - "\n", - "

3.125% Notes due 2028

\n", - "

MSFT

\n", - "

NASDAQ

\n", - "\n", - "

2.625% Notes due 2033

\n", - "

MSFT

\n", - "

NASDAQ

\n", - "\n", - "\n", - "

Indicate by check mark whether the registrant is an emerging growth company as defined in Rule 405 of the Securities Act of 1933 (§230.405 of this

\n", - "\n", - "

chapter) or Rule 12b-2 of the Securities Exchange Act of 1934 (§240.12b-2 of this chapter).

\n", - "\n", - "\n", - "

Emerging growth company ☐

\n", - "\n", - "\n", - "

If an emerging growth company, indicate by check mark if the registrant has elected not to use the extended transition period for complying with any new

\n", - "\n", - "

or revised financial accounting standards provided pursuant to Section 13(a) of the Exchange Act. ☐

\n", - "\n", - "\n", - "\n", - "\n", - "
\n", - "
\n", - "\n", - "

Item 8.01

\n", - "

Other Events.

\n", - "\n", - "\n", - "

Exchange Offers and Consent Solicitations

\n", - "\n", - "\n", - "

On October 16, 2023, Microsoft Corporation (“Microsoft”) announced that, in connection with the previously announced merger of Activision

\n", - "\n", - "

Blizzard, Inc. (“Activision Blizzard”) with and into a wholly-owned subsidiary of Microsoft (the “Merger”), with Activision Blizzard surviving the Merger

\n", - "\n", - "

as a wholly-owned subsidiary of Microsoft, Microsoft has commenced offers to exchange (each, an “Exchange Offer” and, collectively, the “Exchange

\n", - "\n", - "

Offers”) any and all outstanding notes issued by Activision Blizzard (the “Existing Activision Blizzard Notes”) for (1) up to $3,650,000,000 aggregate

\n", - "\n", - "

principal amount of new notes to be issued by Microsoft (the “New Microsoft Notes”) and (2) cash. Concurrently with the Exchange Offers being made by

\n", - "\n", - "

Microsoft,

\n", - "

Activision

\n", - "

Blizzard

\n", - "

is,

\n", - "

upon

\n", - "

Microsoft’s

\n", - "

request,

\n", - "

soliciting

\n", - "

consents

\n", - "

(each,

\n", - "

a

\n", - "

“Consent

\n", - "

Solicitation”

\n", - "

and,

\n", - "

collectively,

\n", - "

the

\n", - "

“Consent

\n", - "\n", - "

Solicitations”)

\n", - "

to

\n", - "

adopt

\n", - "

certain

\n", - "

proposed

\n", - "

amendments

\n", - "

to

\n", - "

each

\n", - "

of

\n", - "

the

\n", - "

corresponding

\n", - "

indentures

\n", - "

governing

\n", - "

the

\n", - "

Existing

\n", - "

Activision

\n", - "

Blizzard

\n", - "

Notes

\n", - "

to

\n", - "\n", - "

eliminate certain of the covenants, restrictive provisions and events of default from such indentures.

\n", - "\n", - "\n", - "

Each

\n", - "

Exchange

\n", - "

Offer

\n", - "

and

\n", - "

Consent

\n", - "

Solicitation

\n", - "

is

\n", - "

conditioned

\n", - "

upon

\n", - "

the

\n", - "

completion

\n", - "

of

\n", - "

the

\n", - "

other

\n", - "

Exchange

\n", - "

Offers

\n", - "

and

\n", - "

Consent

\n", - "

Solicitations,

\n", - "\n", - "

although Microsoft may waive such condition at any time with respect to an Exchange Offer.

\n", - "\n", - "\n", - "

The New Microsoft Notes have not been registered with the Securities and Exchange Commission (the “SEC”) under the Securities Act of 1933,

\n", - "\n", - "

as amended (the “Securities Act”), or any state or foreign securities laws. Therefore, the New Microsoft Notes may not be offered or sold in the United

\n", - "\n", - "

States

\n", - "

or

\n", - "

to

\n", - "

any

\n", - "

U.S.

\n", - "

person

\n", - "

absent

\n", - "

registration,

\n", - "

except

\n", - "

pursuant

\n", - "

to

\n", - "

an

\n", - "

applicable

\n", - "

exemption

\n", - "

from,

\n", - "

or

\n", - "

in

\n", - "

a

\n", - "

transaction

\n", - "

not

\n", - "

subject

\n", - "

to,

\n", - "

the

\n", - "

registration

\n", - "\n", - "

requirements of the Securities Act.

\n", - "\n", - "\n", - "

The

\n", - "

Exchange

\n", - "

Offers

\n", - "

and

\n", - "

Consent

\n", - "

Solicitations

\n", - "

are

\n", - "

being

\n", - "

made

\n", - "

pursuant

\n", - "

to

\n", - "

the

\n", - "

terms

\n", - "

and

\n", - "

subject

\n", - "

to

\n", - "

the

\n", - "

conditions

\n", - "

set

\n", - "

forth

\n", - "

in

\n", - "

the

\n", - "

offering

\n", - "\n", - "

memorandum and consent solicitation statement dated as of October 16, 2023 (as it may be amended or supplemented, the “Offering Memorandum and

\n", - "\n", - "

Consent

\n", - "

Solicitation

\n", - "

Statement”).

\n", - "

A

\n", - "

copy

\n", - "

of

\n", - "

the

\n", - "

press

\n", - "

release

\n", - "

announcing

\n", - "

the

\n", - "

Exchange

\n", - "

Offers

\n", - "

and

\n", - "

the

\n", - "

Consent

\n", - "

Solicitations

\n", - "

is

\n", - "

furnished

\n", - "

herewith

\n", - "

as

\n", - "\n", - "

Exhibit 99.1 and is incorporated herein by reference.

\n", - "\n", - "\n", - "

The information in this Item 8.01, as well as Exhibit 99.1 attached hereto, is being furnished, not filed. Accordingly, such information will not be

\n", - "\n", - "

incorporated by reference into any registration statement filed by Microsoft under the Securities Act, unless specifically identified as being incorporated by

\n", - "\n", - "

reference therein.

\n", - "\n", - "\n", - "

Cautionary Statement Regarding Forward-Looking Information

\n", - "\n", - "\n", - "

This communication includes forward-looking statements within the meaning of federal securities laws. All statements, other than statements of

\n", - "\n", - "

historical fact, included in this communication are forward-looking statements. Such forward-looking statements include, but are not limited to, statements

\n", - "\n", - "

about

\n", - "

the

\n", - "

timing

\n", - "

of

\n", - "

the

\n", - "

Exchange

\n", - "

Offers

\n", - "

and

\n", - "

Consent

\n", - "

Solicitations.

\n", - "

No

\n", - "

assurances

\n", - "

can

\n", - "

be

\n", - "

given

\n", - "

that

\n", - "

the

\n", - "

forward-looking

\n", - "

statements

\n", - "

contained

\n", - "

in

\n", - "

this

\n", - "\n", - "

communication will occur as expected and actual results may differ materially from those included in this communication. Forward-looking statements are

\n", - "\n", - "

based on current expectations and assumptions that involve a number of risks and uncertainties that could cause actual results to differ materially from

\n", - "\n", - "

those included in this communication. Important risks, uncertainties and other factors are described in the Offering Memorandum and Consent Solicitation

\n", - "\n", - "

Statement, Microsoft’s Annual Report on Form 10-K for the fiscal year ended June 30, 2023, Current Reports on Form 8-K and other filings Microsoft

\n", - "\n", - "

makes with the SEC and in Activision Blizzard’s Annual Report on Form 10-K for the year ended December 31, 2022, Activision Blizzard’s Quarterly

\n", - "\n", - "

Reports

\n", - "

on

\n", - "

Form

\n", - "

10-Q

\n", - "

for

\n", - "

the

\n", - "

quarterly

\n", - "

periods

\n", - "

ended

\n", - "

March

\n", - "

31,

\n", - "

2023

\n", - "

and

\n", - "

June

\n", - "

30,

\n", - "

2023,

\n", - "

Current

\n", - "

Reports

\n", - "

on

\n", - "

Form

\n", - "

8-K

\n", - "

and

\n", - "

other

\n", - "

filings

\n", - "

Activision

\n", - "\n", - "

Blizzard makes with the SEC. Forward-looking statements are based on the estimates and opinions of management at the time the statements are made.

\n", - "\n", - "

Except

\n", - "

to

\n", - "

the

\n", - "

extent

\n", - "

required

\n", - "

by

\n", - "

applicable

\n", - "

law,

\n", - "

neither

\n", - "

Microsoft

\n", - "

nor

\n", - "

Activision

\n", - "

Blizzard

\n", - "

undertakes

\n", - "

any

\n", - "

obligation

\n", - "

to

\n", - "

publicly

\n", - "

update

\n", - "

or

\n", - "

revise

\n", - "

any

\n", - "\n", - "

forward-looking

\n", - "

statement,

\n", - "

whether

\n", - "

as

\n", - "

a

\n", - "

result

\n", - "

of

\n", - "

new

\n", - "

information,

\n", - "

future

\n", - "

events

\n", - "

or

\n", - "

otherwise.

\n", - "

You

\n", - "

are

\n", - "

cautioned

\n", - "

not

\n", - "

to

\n", - "

place

\n", - "

undue

\n", - "

reliance

\n", - "

on

\n", - "

these

\n", - "\n", - "

forward-looking statements that speak only as of the date hereof.

\n", - "\n", - "\n", - "
\n", - "
\n", - "\n", - "

No Offer or Solicitation

\n", - "\n", - "\n", - "

This communication is not intended to and shall not constitute an offer to sell or purchase, or a solicitation of an offer to sell or purchase, or the

\n", - "\n", - "

solicitation of tenders or consents with respect to, any security. No offer, solicitation, purchase or sale will be made in any jurisdiction in which such an

\n", - "\n", - "

offer,

\n", - "

solicitation,

\n", - "

or

\n", - "

sale

\n", - "

would

\n", - "

be

\n", - "

unlawful.

\n", - "

The

\n", - "

Exchange

\n", - "

Offers

\n", - "

and

\n", - "

Consent

\n", - "

Solicitations

\n", - "

are

\n", - "

being

\n", - "

made

\n", - "

to

\n", - "

eligible

\n", - "

holders

\n", - "

solely

\n", - "

pursuant

\n", - "

to

\n", - "

the

\n", - "\n", - "

Offering Memorandum and Consent Solicitation Statement and only to such persons and in such jurisdictions as is permitted under applicable law.

\n", - "\n", - "\n", - "

Item 9.01

\n", - "

Financial Statements and Exhibits.

\n", - "\n", - "\n", - "

(d)

\n", - "

Exhibits.

\n", - "\n", - "\n", - "

Exhibit

\n", - "\n", - "

No.

\n", - "\n", - "\n", - "

Description

\n", - "\n", - "

99.1

\n", - "

Joint Press Release, dated October 16, 2023

\n", - "\n", - "

104

\n", - "

Cover Page Interactive Data File (formatted as Inline XBRL and contained in Exhibit 101)

\n", - "\n", - "\n", - "
\n", - "
\n", - "\n", - "

SIGNATURE

\n", - "\n", - "\n", - "

Pursuant

\n", - "

to

\n", - "

the

\n", - "

requirements

\n", - "

of

\n", - "

the

\n", - "

Securities

\n", - "

Exchange

\n", - "

Act

\n", - "

of

\n", - "

1934,

\n", - "

the

\n", - "

Registrant

\n", - "

has

\n", - "

duly

\n", - "

caused

\n", - "

this

\n", - "

report

\n", - "

to

\n", - "

be

\n", - "

signed

\n", - "

on

\n", - "

its

\n", - "

behalf

\n", - "

by

\n", - "

the

\n", - "\n", - "

undersigned thereunto duly authorized.

\n", - "\n", - "\n", - "

MICROSOFT CORPORATION

\n", - "\n", - " \n", - "

By:

\n", - "

/s/ Keith R. Dolliver

\n", - "\n", - "

Keith R. Dolliver

\n", - "\n", - "

Corporate Secretary

\n", - "\n", - " \n", - "

Date: October 16, 2023

\n", - " \n", - "\n", - "\n", - "
\n", - "
\n", - "

Exhibit 99.1

\n", - "\n", - "\n", - "

Microsoft Commences Private Exchange Offers and Activision Blizzard Commences Consent Solicitations

\n", - "\n", - "\n", - "

REDMOND, Wash. and SANTA MONICA, Calif. – October 16, 2023 – Microsoft Corporation (Nasdaq: MSFT) (“Microsoft”) and Activision

\n", - "\n", - "

Blizzard, Inc. (Nasdaq: ATVI) (“Activision Blizzard”) today announced that, in connection with the previously announced merger of Activision Blizzard

\n", - "\n", - "

with and into a wholly owned subsidiary of Microsoft (the “Merger”), with Activision Blizzard surviving the Merger as a wholly owned subsidiary of

\n", - "\n", - "

Microsoft, Microsoft has commenced offers to Eligible Holders (as defined herein) to exchange (each an “Exchange Offer” and collectively, the “Exchange

\n", - "\n", - "

Offers”) any and all outstanding notes issued by Activision Blizzard as set forth in the table below (the “Existing Activision Blizzard Notes”) for (1) up to

\n", - "\n", - "

$3,650,000,000 aggregate principal amount of new notes issued by Microsoft (the “New Microsoft Notes”) and (2) cash.

\n", - "\n", - "\n", - "

The following table sets forth the Exchange Consideration and Total Exchange Consideration for each series of Existing Activision Blizzard

\n", - "\n", - "

Notes:

\n", - "\n", - "\n", - "

Title of

\n", - "\n", - "

Series

\n", - " \n", - "

CUSIP

\n", - "\n", - "

Number

\n", - "

ISIN

\n", - "

Maturity Date

\n", - " \n", - "

Aggregate

\n", - "\n", - "

Principal

\n", - "\n", - "

Amount

\n", - "\n", - "

Outstanding

\n", - " \n", - "

Exchange

\n", - "\n", - "

Consideration(1)

\n", - " \n", - "

Total Exchange

\n", - "\n", - "

Consideration(2)

\n", - "\n", - "

3.400% Senior Notes due

\n", - "\n", - "

2026

\n", - "\n", - "

00507VAK5

\n", - "

US00507VAK52

\n", - "

September 15, 2026

\n", - "

$850,000,000

\n", - "

$970 principal amount of

\n", - "\n", - "

New Microsoft 3.400%

\n", - "\n", - "

Notes due 2026

\n", - "\n", - "

$1,000 principal amount of

\n", - "\n", - "

New Microsoft 3.400%

\n", - "\n", - "

Notes due 2026 and $1.00

\n", - "\n", - "

in cash

\n", - "\n", - "

3.400% Senior Notes due

\n", - "\n", - "

2027

\n", - "\n", - "

00507VAM1

\n", - "

US00507VAM19

\n", - "

June 15, 2027

\n", - "

$400,000,000

\n", - "

$970 principal amount of

\n", - "\n", - "

New Microsoft 3.400%

\n", - "\n", - "

Notes due 2027

\n", - "\n", - "

$1,000 principal amount of

\n", - "\n", - "

New Microsoft 3.400%

\n", - "\n", - "

Notes due 2027 and $1.00

\n", - "\n", - "

in cash

\n", - "\n", - "

1.350% Senior Notes due

\n", - "\n", - "

2030

\n", - "\n", - "

00507VAP4

\n", - "

US00507VAP40

\n", - "

September 15, 2030

\n", - "

$500,000,000

\n", - "

$970 principal amount of

\n", - "\n", - "

New Microsoft 1.350%

\n", - "\n", - "

Notes due 2030

\n", - "\n", - "

$1,000 principal amount of

\n", - "\n", - "

New Microsoft 1.350%

\n", - "\n", - "

Notes due 2030 and $1.00

\n", - "\n", - "

in cash

\n", - "\n", - "

4.500% Senior Notes due

\n", - "\n", - "

2047

\n", - "\n", - "

00507VAN9

\n", - "

US00507VAN91

\n", - "

June 15, 2047

\n", - "

$400,000,000

\n", - "

$970 principal amount of

\n", - "\n", - "

New Microsoft 4.500%

\n", - "\n", - "

Notes due 2047

\n", - "\n", - "

$1,000 principal amount of

\n", - "\n", - "

New Microsoft 4.500%

\n", - "\n", - "

Notes due 2047 and $1.00

\n", - "\n", - "

in cash

\n", - "\n", - "

2.500% Senior Notes due

\n", - "\n", - "

2050

\n", - "\n", - "

00507VAQ2

\n", - "

US00507VAQ23

\n", - "

September 15, 2050

\n", - "

$1,500,000,000

\n", - "

$970 principal amount of

\n", - "\n", - "

New Microsoft 2.500%

\n", - "\n", - "

Notes due 2050

\n", - "\n", - "

$1,000 principal amount of

\n", - "\n", - "

New Microsoft 2.500%

\n", - "\n", - "

Notes due 2050 and $1.00

\n", - "\n", - "

in cash

\n", - "\n", - "\n", - "
\n", - "
\n", - "\n", - "\n", - "\n", - "

(1)

\n", - "

For each $1,000 principal

\n", - "

amount

\n", - "

of

\n", - "

Existing

\n", - "

Activision

\n", - "

Blizzard

\n", - "

Notes

\n", - "

validly

\n", - "

tendered

\n", - "

after

\n", - "

the

\n", - "

Early

\n", - "

Tender

\n", - "

Date

\n", - "

(as

\n", - "

defined

\n", - "

herein)

\n", - "

but

\n", - "

at

\n", - "

or

\n", - "\n", - "

before the Expiration Date (as defined herein), not validly withdrawn and accepted for exchange.

\n", - "\n", - "

(2)

\n", - "

For each $1,000 principal amount of Existing Activision Blizzard Notes validly tendered at or before the Early Tender Date, not validly withdrawn and

\n", - "\n", - "

accepted for exchange.

\n", - "\n", - "\n", - "

Concurrently with the Exchange Offers being made by Microsoft, Activision Blizzard is, upon Microsoft’s request, soliciting consents from

\n", - "\n", - "

Eligible Holders (each, a “Consent Solicitation” and, collectively, the “Consent Solicitations”) to adopt certain proposed amendments to each of the

\n", - "\n", - "

corresponding indentures governing the Existing Activision Blizzard Notes to eliminate certain of the covenants, restrictive provisions and events of

\n", - "\n", - "

default from such indentures (with respect to the corresponding indenture for such Existing Activision Blizzard Notes, the “Proposed Amendments”).

\n", - "\n", - "

Eligible Holders may deliver their consent to the Proposed Amendments only by tendering Existing Activision Blizzard Notes of the applicable series in

\n", - "\n", - "

the Exchange Offers and Consent Solicitations. Eligible Holders may not deliver a consent in a Consent Solicitation without tendering Existing Activision

\n", - "\n", - "

Blizzard Notes in the applicable Exchange Offer and Eligible Holders may not tender Existing Activision Blizzard Notes without also having been deemed

\n", - "\n", - "

to deliver a consent.

\n", - "\n", - "\n", - "

The Exchange Offers and Consent Solicitations are being made pursuant to the terms and subject to the conditions set forth in the offering

\n", - "\n", - "

memorandum and consent solicitation statement dated as of October 16, 2023 (as it may be amended or supplemented, the “Offering Memorandum and

\n", - "\n", - "

Consent Solicitation Statement”). Microsoft, in its sole discretion, may terminate, withdraw, amend or extend any of the Exchange Offers, subject to the

\n", - "\n", - "

terms and conditions set forth in the Offering Memorandum and Consent Solicitation Statement. Any such termination, withdrawal, amendment or

\n", - "\n", - "

extension by Microsoft will automatically terminate, withdraw, amend or extend the corresponding Consent Solicitation, as applicable.

\n", - "\n", - "\n", - "

2

\n", - "
\n", - "
\n", - "\n", - "

In addition, each Exchange Offer and Consent Solicitation is conditioned upon the completion of the other Exchange Offers and Consent

\n", - "\n", - "

Solicitations, although Microsoft may waive such condition at any time with respect to an Exchange Offer. Any waiver of a condition by Microsoft with

\n", - "\n", - "

respect to an Exchange Offer will automatically waive such condition with respect to the corresponding Consent Solicitation.

\n", - "\n", - "\n", - "

Eligible Holders who validly tender (and do not validly withdraw) their Existing Activision Blizzard Notes at or before to 5:00 p.m., New York

\n", - "\n", - "

City time, on October 27, 2023, unless extended (the “Early Tender Date”), will be eligible to receive, on the applicable settlement date, the applicable

\n", - "\n", - "

Total Exchange Consideration as set forth in the table above for all such Existing Activision Blizzard Notes that are accepted. Eligible Holders who validly

\n", - "\n", - "

tender (and do not validly withdraw) their Existing Activision Blizzard Notes after the Early Tender Date but at or before 5:00 p.m., New York City time,

\n", - "\n", - "

on November 14, 2023, unless extended (the “Expiration Date”), will be eligible to receive, on the applicable settlement date, the applicable Exchange

\n", - "\n", - "

Consideration as set forth in the table above for all such Existing Activision Blizzard Notes that are accepted. The Early Settlement Date will be determined

\n", - "\n", - "

at Microsoft’s option and is currently expected to occur within six business days after the Early Tender Date. The Final Settlement Date will be promptly

\n", - "\n", - "

after the Expiration Date and is currently expected to occur within two business days after the Expiration Date.

\n", - "\n", - "\n", - "

The Exchange Offers and Consent Solicitations will only be made, and documents relating to the Exchange Offers and Consent Solicitations will

\n", - "\n", - "

only be distributed, to holders of Existing Activision Blizzard Notes who complete and return an eligibility letter confirming that they are persons (a) in the

\n", - "\n", - "

United States who are reasonably believed to be “qualified institutional buyers” as defined in Rule 144A under the Securities Act of 1933, as amended (the

\n", - "\n", - "

“Securities Act”), or (b) that are outside the United States who are not “U.S. persons” as defined in Rule 902 under the Securities Act and who are eligible

\n", - "\n", - "

to participate in the Exchange Offer pursuant to the laws of the applicable jurisdiction, as set forth in the eligibility letter (“Eligible Holders”).

\n", - "\n", - "\n", - "

3

\n", - "
\n", - "
\n", - "\n", - "

Eligible Holders of Existing Activision Blizzard Notes who are located in or a resident of Canada must also complete and return a Canadian

\n", - "\n", - "

supplemental eligibility letter to D.F. King & Co., Inc. (the “Information Agent” and the “Exchange Agent”) establishing its eligibility to participate in the

\n", - "\n", - "

Exchange Offers and providing supplemental information required for Canadian securities regulatory reporting purposes. Each holder of Existing

\n", - "\n", - "

Activision Blizzard Notes will, by participating in any Exchange Offer, be deemed to represent and warrant that it is not located in or a resident of any

\n", - "\n", - "

province or territory of Canada, and that it is not tendering any Existing Activision Blizzard Notes on behalf of a beneficial owner that is located in or a

\n", - "\n", - "

resident of Canada, unless either: (i) such holder has completed and returned a Canadian supplemental eligibility letter to the Information Agent, or

\n", - "\n", - "

(ii) such holder is an account manager outside Canada acting on behalf of a Canadian beneficial owner on a fully-discretionary basis, and no acts in

\n", - "\n", - "

furtherance of the exchange of such beneficial owner’s Existing Activision Blizzard Notes take place in Canada.

\n", - "\n", - "\n", - "

The complete terms and conditions of the Exchange Offers and Consent Solicitations are described in the Offering Memorandum and Consent

\n", - "\n", - "

Solicitation Statement, a copy of which may be obtained by Eligible Holders by contacting D.F. King & Co., Inc., the Exchange Agent and Information

\n", - "\n", - "

Agent in connection with the Exchange Offers and Consent Solicitations, by sending an email to MSFT-ATVI@dfking.com or by calling (866) 227-7300

\n", - "\n", - "

(U.S. toll-free) or (212) 269-5550 (banks and brokers). The eligibility letter is available electronically at: https://www.dfking.com/MSFT-ATVI.

\n", - "\n", - "\n", - "

4

\n", - "
\n", - "
\n", - "\n", - "

This press release does not constitute an offer to sell or purchase, or a solicitation of an offer to sell or purchase, or the solicitation of tenders or

\n", - "\n", - "

consents with respect to, any security. This press release should not be construed as an offer to sell or purchase, or a solicitation of an offer to sell or

\n", - "\n", - "

purchase, or the solicitation of tenders or consents with respect to, any Microsoft securities or other securities by Activision Blizzard. No offer, solicitation,

\n", - "\n", - "

purchase or sale will be made in any jurisdiction in which such an offer, solicitation, or sale would be unlawful. The Exchange Offers and Consent

\n", - "\n", - "

Solicitations are being made to Eligible Holders solely pursuant to the Offering Memorandum and Consent Solicitation Statement and only to such persons

\n", - "\n", - "

and in such jurisdictions as is permitted under applicable law.

\n", - "\n", - "\n", - "

The New Microsoft Notes have not been registered with the Securities and Exchange Commission (the “SEC”) under the Securities Act or any

\n", - "\n", - "

state or foreign securities laws. Therefore, the New Microsoft Notes may not be offered or sold in the United States or to any U.S. person absent

\n", - "\n", - "

registration, except pursuant to an applicable exemption from, or in a transaction not subject to, the registration requirements of the Securities Act. In

\n", - "\n", - "

connection with the Exchange Offers, Microsoft will enter into a registration rights agreement, pursuant to which Microsoft will be obligated to use

\n", - "\n", - "

commercially reasonable efforts to file with the SEC and cause to become effective a registration statement with respect to an offer to exchange each series

\n", - "\n", - "

of New Microsoft Notes for new notes within 450 days of the settlement date. In addition, Microsoft has agreed to use commercially reasonable efforts to

\n", - "\n", - "

file a shelf registration statement to cover resales of the New Microsoft Notes under the Securities Act in certain circumstances.

\n", - "\n", - "\n", - "

5

\n", - "
\n", - "
\n", - "\n", - "

About Microsoft

\n", - "\n", - "\n", - "

Microsoft enables digital transformation for the era of an intelligent cloud and an intelligent edge. Its mission is to empower every person and

\n", - "\n", - "

every organization on the planet to achieve more.

\n", - "\n", - "\n", - "

For More Information, Press Only:

\n", - "\n", - "\n", - "

Microsoft Media Relations, WE Communications for Microsoft, (425) 638-7777, rapidresponse@we-worldwide.com

\n", - "\n", - "\n", - "

Note to editors: For more information, news and perspectives from Microsoft, please visit the Microsoft News Center at

\n", - "\n", - "

http://news.microsoft.com. Web links, telephone numbers and titles were correct at time of publication but may have changed. For additional assistance,

\n", - "\n", - "

journalists and analysts may contact Microsoft’s Rapid Response Team or other appropriate contacts listed at https://news.microsoft.com/microsoft-public-

\n", - "\n", - "

relations-contacts.

\n", - "\n", - "\n", - "

About Activision Blizzard

\n", - "\n", - "\n", - "

Activision Blizzard’s mission is to connect and engage the world through epic entertainment. Through communities rooted in Activision

\n", - "\n", - "

Blizzard’s video games, Activision Blizzard enables hundreds of millions of people to experience joy, thrill and achievement.

\n", - "\n", - "\n", - "

Cautionary Note Regarding Forward-looking Statements

\n", - "\n", - "\n", - "

This press release includes forward-looking statements within the meaning of federal securities laws. All statements, other than statements of

\n", - "\n", - "

historical fact, included in this press release are forward-looking statements. Such forward-looking statements include, but are not limited to, statements

\n", - "\n", - "

about the timing of the Exchange Offers and Consent Solicitations. No assurances can be given that the forward-looking statements contained in this press

\n", - "\n", - "

release will occur as expected and actual results may differ materially from those included in this press release. Forward-looking statements are based on

\n", - "\n", - "

current expectations and assumptions that involve a number of risks and uncertainties that could cause actual results to differ materially from those

\n", - "\n", - "

included in this press release. Important risks, uncertainties and other factors are described in the Offering Memorandum and Consent Solicitation

\n", - "\n", - "

Statement, Microsoft’s Annual Report on Form 10-K for the fiscal year ended June 30, 2023, Activision Blizzard’s Annual Report on Form 10-K for the

\n", - "\n", - "

fiscal year ended December 31, 2022, Activision Blizzard’s Quarterly Reports on Form 10-Q for the quarterly periods ended March 31, 2023 and June 30,

\n", - "\n", - "

2023, and Current Reports on Form 8-K and other filings Microsoft and Activision Blizzard make with the SEC. Forward-looking statements are based on

\n", - "\n", - "

the estimates and opinions of management at the time the statements are made. Except to the extent required by applicable law, neither Microsoft nor

\n", - "\n", - "

Activision Blizzard undertakes any obligation to publicly update or revise any forward-looking statement, whether as a result of new information, future

\n", - "\n", - "

events or otherwise. You are cautioned not to place undue reliance on these forward-looking statements that speak only as of the date hereof.

\n", - "\n", - "\n", - "

6

\n", - "\n", - "
\n", + "
Sticky Note from Paperless
\n", + "
This is a sample page scanned at 200dpi and converted to PDF. It is not searchable. That is, all you see is the original image of the source document.
\n", + "
\n", + "
\n", + "\n", "" ], "text/plain": [ diff --git a/notebooks/test_llmsherpa_api.ipynb b/notebooks/test_llmsherpa_api.ipynb index e38d39e..18cc7db 100644 --- a/notebooks/test_llmsherpa_api.ipynb +++ b/notebooks/test_llmsherpa_api.ipynb @@ -40,7 +40,7 @@ }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 119, "id": "d765b72f-5d58-4343-9f48-432acb31b7d6", "metadata": {}, "outputs": [ @@ -48,7 +48,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/var/folders/10/036rtqts0zv6b9spnjgkl1hh0000gn/T/ipykernel_74969/3984827310.py:2: DeprecationWarning: Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display\n", + "/var/folders/10/036rtqts0zv6b9spnjgkl1hh0000gn/T/ipykernel_74969/3404817863.py:2: DeprecationWarning: Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display\n", " from IPython.core.display import display, HTML\n" ] } @@ -57,7 +57,7 @@ "from llmsherpa.readers import LayoutPDFReader\n", "from IPython.core.display import display, HTML\n", "# llmsherpa_api_url = \"https://readers.llmsherpa.com/api/document/developer/parseDocument?renderFormat=all\"\n", - "llmsherpa_api_url = \"http://localhost:5001/api/parseDocument?renderFormat=all\"\n", + "llmsherpa_api_url = \"http://localhost:5001/api/parseDocument?renderFormat=all&useNewIndentParser=true\"\n", "pdf_url = \"https://arxiv.org/pdf/1910.13461.pdf\" # also allowed is a file path e.g. 
/home/downloads/xyz.pdf\n", "# pdf_url = \"https://www.apache.org/licenses/LICENSE-2.0.txt\"\n", "# pdf_url = \"https://microsoft.gcs-web.com/static-files/931d7780-ccfc-47e3-97ad-09d87e12b795\"\n", @@ -66,36 +66,159 @@ "# pdf_url = \"https://en.wikipedia.org/wiki/Language_model\"\n", "# pdf_url = \"https://raw.githubusercontent.com/nlmatics/llmsherpa/main/README.md\"\n", "# pdf_url = \"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id=35362092&retmode=xml\"\n", + "# pdf_url = \"https://solutions.weblite.ca/pdfocrx/scansmpl.pdf\"\n", + "do_ocr = True\n", + "if do_ocr:\n", + " llmsherpa_api_url = llmsherpa_api_url + \"&applyOcr=yes\"\n", "pdf_reader = LayoutPDFReader(llmsherpa_api_url)\n", "doc = pdf_reader.read_pdf(pdf_url)" ] }, { "cell_type": "code", - "execution_count": 74, + "execution_count": 120, "id": "274fc39e-a574-4312-9d44-53b7758fa961", "metadata": {}, "outputs": [ { "data": { + "text/html": [ + "

BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension

Mike Lewis*, Yinhan Liu*, Naman Goyal*, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov, Luke Zettlemoyer Facebook AI

{mikelewis,yinhanliu,naman}@fb.com

Abstract

We present BART, a denoising autoencoder for pretraining sequence-to-sequence models.\n", + "BART is trained by (1) corrupting text with an arbitrary noising function, and (2) learning a model to reconstruct the original text.\n", + "It uses a standard Tranformer-based neural machine translation architecture which, despite its simplicity, can be seen as generalizing BERT (due to the bidirectional encoder), GPT (with the left-to-right decoder), and many other more recent pretraining schemes.\n", + "We evaluate a number of noising approaches, finding the best performance by both randomly shuffling the order of the original sentences and using a novel in-filling scheme, where spans of text are replaced with a single mask token.\n", + "BART is particularly effective when fine tuned for text generation but also works well for comprehension tasks.\n", + "It matches the performance of RoBERTa with comparable training resources on GLUE and SQuAD, achieves new stateof-the-art results on a range of abstractive dialogue, question answering, and summarization tasks, with gains of up to 6 ROUGE.\n", + "BART also provides a 1.1 BLEU increase over a back-translation system for machine translation, with only target language pretraining.\n", + "We also report ablation experiments that replicate other pretraining schemes within the BART framework, to better measure which factors most influence end-task performance.

1 Introduction

methods have achieved remarkable success in a wide range of NLP tasks (Mikolov et al., 2013; Peters et al., 2018; Devlin et al., 2019; Joshi et al., 2019; Yang et al., 2019; Liu et al., 2019).\n", + "The most successful approaches have been variants of masked language models, which are denoising autoencoders that are trained to reconstruct text where a random subset of the words has been masked out.\n", + "Recent work has shown gains by improving the distribution of masked tokens (Joshi et al., 2019), the order in which masked tokens are predicted (Yang et al., 2019), and the available context for replacing masked tokens (Dong et al., 2019).\n", + "However, these methods typically focus on particular types of end tasks (e.g. span prediction, generation, etc.), limiting their applicability.

In this paper, we present BART, which pre-trains a model combining Bidirectional and Auto-Regressive Transformers.\n", + "BART is a denoising autoencoder built with a sequence-to-sequence model that is applicable to a very wide range of end tasks.\n", + "Pretraining has two stages (1) text is corrupted with an arbitrary noising function, and (2) a sequence-to-sequence model is learned to reconstruct the original text.\n", + "BART uses a standard Tranformer-based neural machine translation architecture which, despite its simplicity, can be seen as generalizing BERT (due to the bidirectional encoder), GPT (with the left-to-right decoder), and many other more recent pretraining schemes (see Figure 1).

A key advantage of this setup is the noising flexibility; arbitrary transformations can be applied to the original text, including changing its length.\n", + "We evaluate a number of noising approaches, finding the best performance by both randomly shuffling the order of the original sentences and using a novel in-filling scheme, where arbitrary length spans of text (including zero length) are replaced with a single mask token.\n", + "This approach generalizes the original word masking and next sentence prediction objectives in BERT by forcing the model to reason more about overall sentence length and make longer range transformations to the input.

BART is particularly effective when fine tuned for text generation but also works well for comprehension tasks.\n", + "It matches the performance of RoBERTa (Liu et al., 2019) with comparable training resources on GLUE (Wang et al., 2018) and SQuAD (Rajpurkar et al., 2016), and achieves new state-of-the-art results on a range of abstractive dialogue, question answering, and summarization tasks.\n", + "For example, it improves performance by 6 ROUGE over previous work on XSum (Narayan et al., 2018).

BART also opens up new ways of thinking about fine tuning.\n", + "We present a new scheme for machine translation where a BART model is stacked above a few additional transformer layers.\n", + "These layers are trained to essentially translate the foreign language to noised

B D A B C D E

Autoregressive Decoder

Bidirectional Encoder

A _ C _ E
A B C D
  • (a) BERT: Random tokens are replaced with masks, and the document is encoded bidirectionally.\n", + "Missing tokens are predicted independently, so BERT cannot easily be (b) GPT: Tokens are predicted auto-regressively, meaning GPT can be used for generation.\n", + "However words can only condition on leftward context, so it cannot learn bidirec- tional interactions.
  • used for generation.

    A B C D E
    Bidirectional EncoderAutoregressive Decoder
    A _ B _ E A B C D
  • (c) BART: Inputs to the encoder need not be aligned with decoder outputs, allowing arbitary noise transformations.\n", + "Here, a document has been corrupted by replacing spans of text with mask symbols.\n", + "The corrupted document (left) is encoded with a bidirectional model, and then the likelihood of the original document (right) is calculated with an autoregressive decoder.\n", + "For fine-tuning, an uncorrupted document is input to both the encoder and decoder, and we use representations from the final hidden state of the decoder.
  • Figure 1: A schematic comparison of BART with BERT (Devlin et al., 2019) and GPT (Radford et al., 2018).

    English, by propagation through BART, thereby using BART as a pre-trained target-side language model.\n", + "This approach improves performance over a strong back-translation MT baseline by 1.1 BLEU on the WMT Romanian-English benchmark.

    To better understand these effects, we also report an ablation analysis that replicates other recently proposed training objectives.\n", + "This study allows us to carefully control for a number of factors, including data and optimization parameters, which have been shown to be as important for overall performance as the selection of training objectives (Liu et al., 2019).\n", + "We find that BART exhibits the most consistently strong performance across the full range of tasks we consider.

    2 Model

    is a denoising autoencoder that maps a corrupted document to the original document it was derived from.\n", + "It is implemented as a sequence-to-sequence model with a bidirectional encoder over corrupted text and a left-to-right autoregressive decoder.\n", + "For pre-training, we optimize the negative log likelihood of the original document.

    2.1 Architecture

    BART uses the standard sequence-to-sequence Transformer architecture from (Vaswani et al., 2017), except, following GPT, that we modify ReLU activation functions to GeLUs (Hendrycks & Gimpel, 2016) and initialise parameters from N (0, 0.02).\n", + "For our base model, we use 6 layers in the encoder and de- coder, and for our large model we use 12 layers in each.\n", + "The architecture is closely related to that used in BERT, with the following differences: (1) each layer of the decoder additionally performs cross-attention over the final hidden layer of the encoder (as in the transformer sequence-to-sequence model); and (2) BERT uses an additional feed-forward network before wordprediction, which BART does not.\n", + "In total, BART contains roughly 10% more parameters than the equivalently sized BERT model.

    2.2 Pre-training BART

    BART is trained by corrupting documents and then optimizing a reconstruction loss—the cross-entropy between the decoder’s output and the original document.\n", + "Unlike existing denoising autoencoders, which are tailored to specific noising schemes, BART allows us to apply any type of document corruption.\n", + "In the extreme case, where all information about the source is lost, BART is equivalent to a language model.

    We experiment with several previously proposed and novel transformations, but we believe there is a significant potential for development of other new alternatives.\n", + "The transformations we used are summarized below, and examples are shown in Figure 2.

    Token Masking Following BERT (Devlin et al., 2019), random tokens are sampled and replaced with [MASK] elements.

    Token Deletion Random tokens are deleted from the input.\n", + "In contrast to token masking, the model must decide which positions are missing inputs.

    A _C . _ E .D E . A B C .C . D E . A B
    Sentence PermutationDocument RotationToken Masking
    A B C . D E .A . C . E .A _ . D _ E .

    Token Deletion Text Infilling

    Figure 2: Transformations for noising the input that we experiment with.\n", + "These transformations can be composed.

    Text Infilling A number of text spans are sampled, with span lengths drawn from a Poisson distribution (λ = 3).\n", + "Each span is replaced with a single [MASK] token.\n", + "0-length spans correspond to the insertion of [MASK] tokens.\n", + "Text infilling is inspired by SpanBERT (Joshi et al., 2019), but SpanBERT samples span lengths from a different (clamped geometric) distribution, and replaces each span with a sequence of [MASK] tokens of exactly the same length.\n", + "Text infilling teaches the model to predict how many tokens are missing from a span.

    Sentence Permutation A document is divided into sentences based on full stops, and these sentences are shuffled in a random order.

    Document Rotation A token is chosen uniformly at random, and the document is rotated so that it begins with that token.\n", + "This task trains the model to identify the start of the document.

    3 Fine-tuning BART

    The representations produced by BART can be used in several ways for downstream applications.

    3.1 Sequence Classification Tasks

    For sequence classification tasks, the same input is fed into the encoder and decoder, and the final hidden state of the final decoder token is fed into new multi-class linear classifier.\n", + "This approach is related to the CLS token in BERT; however we add the additional token to the end so that representation for the token in the decoder can attend to decoder states from the complete input (Figure 3a).

    3.2 Token Classification Tasks

    For token classification tasks, such as answer endpoint classification for SQuAD, we feed the complete document into the encoder and decoder, and use the top hidden state of the decoder as a representation for each word.\n", + "This representation is used to classify the token.

    3.3 Sequence Generation Tasks

    Because BART has an autoregressive decoder, it can be directly fine tuned for sequence generation tasks such as abstractive question answering and summarization.\n", + "In both of these tasks, information is copied from the input but manipulated, which is closely related to the denoising pre-training objective.\n", + "Here, the encoder input is the input sequence, and the decoder generates outputs autoregressively.

    3.4 Machine Translation

    We also explore using BART to improve machine translation decoders for translating into English.\n", + "Previous work Edunov et al.\n", + "(2019) has shown that models can be improved by incorporating pre-trained encoders, but gains from using pre-trained language models in decoders have been limited.\n", + "We show that it is possible to use the entire BART model (both encoder and decoder) as a single pretrained decoder for machine translation, by adding a new set of encoder parameters that are learned from bitext (see Figure 3b).

    More precisely, we replace BART’s encoder embedding layer with a new randomly initialized encoder.\n", + "The model is trained end-to-end, which trains the new encoder to map foreign words into an input that BART can de-noise to English.\n", + "The new encoder can use a separate vocabulary from the original BART model.

    We train the source encoder in two steps, in both cases backpropagating the cross-entropy loss from the output of the BART model.\n", + "In the first step, we freeze most of BART parameters and only update the randomly initialized source encoder, the BART positional embeddings, and the self-attention input projection matrix of BART’s encoder first layer.\n", + "In the second step, we train all model parameters for a small number of iterations.

    4 Comparing Pre-training Objectives

    BART supports a much wider range of noising schemes during pre-training than previous work.\n", + "We compare a range of options using base-size models (6 encoder and 6 decoder layers, with a hidden size of 768), evaluated on a representative subset of the tasks we will consider for the full large scale experiments in §5.

    4.1 Comparison Objectives

    While many pre-training objectives have been proposed, fair comparisons between these have been difficult to perform, at least in part due to differences in training data, training resources, architectural differences between models, and fine-tuning procedures.\n", + "We

    A\n", + "B\n", + "C\n", + "D E

    label

    Pre-trained EncoderPre-trained Decoder
    Pre-trained EncoderPre-trained Decoder
    ABCD
    Randomly Initialized Encoder

    A B C D E A B C D E

    α\n", + "β\n", + "γ\n", + "δ ε

  • (a) To use BART for classification problems, the same input is fed into the encoder and decoder, and the representation from the final output is used.\n", + "(b) For machine translation, we learn a small additional encoder that replaces the word embeddings in BART.\n", + "The new encoder can use a disjoint vocabulary.
  • Figure 3: Fine tuning BART for classification and translation.

    re-implement strong pre-training approaches recently proposed for discriminative and generation tasks.\n", + "We aim, as much as possible, to control for differences unrelated to the pre-training objective.\n", + "However, we do make minor changes to the learning rate and usage of layer normalisation in order to improve performance (tuning these separately for each objective).\n", + "For reference, we compare our implementations with published numbers from BERT, which was also trained for 1M steps on a combination of books and Wikipedia data.\n", + "We compare the following approaches:

    Language Model Similarly to GPT (Radford et al., 2018), we train a left-to-right Transformer language model.\n", + "This model is equivalent to the BART decoder, without cross-attention.

    Permuted Language Model Based on XLNet (Yang et al., 2019), we sample 1/6 of the tokens, and generate them in a random order autoregressively.\n", + "For consistency with other models, we do not implement the relative positional embeddings or attention across segments from XLNet.

    Masked Language Model Following BERT (Devlin et al., 2019), we replace 15% of tokens with [MASK] symbols, and train the model to independently predict the original tokens.

    Multitask Masked Language Model As in UniLM (Dong et al., 2019), we train a Masked Language Model with additional self-attention masks.\n", + "Self-attention masks are chosen randomly with the following proportions: 1/6 left-to-right, 1/6 right-to-left, 1/3 unmasked, and 1/3 with the first 50% of tokens unmasked and a left-to-right mask for the remainder.

    Masked Seq-to-Seq Inspired by MASS (Song et al., 2019), we mask a span containing 50% of tokens, and train a sequence to sequence model to predict the masked tokens.

    For the Permuted LM, Masked LM and Multitask Masked LM, we use two-stream attention (Yang et al., 2019) to efficiently compute likelihoods of the output part of the sequence (using a diagonal self-attention mask on the output to predict words left-to-right).

    We experiment with (1) treating the task as a standard sequence-to-sequence problem, where the source input to the encoder and the target is the decoder output, or (2) adding the source as prefix to the target in the decoder, with a loss only on the target part of the sequence.\n", + "We find the former works better for BART models, and the latter for other models.

    To most directly compare our models on their ability to model their fine-tuning objective (the log likelihood of the human text), we report perplexity in Table 1.

    4.2 Tasks

    SQuAD (Rajpurkar et al., 2016), an extractive question answering task on Wikipedia paragraphs.\n", + "Answers are text spans extracted from a given document context.\n", + "Similar to BERT (Devlin et al., 2019), we use concatenated question and context as input to the encoder of BART, and additionally pass them to the decoder.\n", + "The model includes classifiers to predict the start and end indices of each token.

    MNLI (Williams et al., 2017), a bitext classification task to predict whether one sentence entails another.\n", + "The fine-tuned model concatenates the two sentences with an appended EOS token, and passes them to both the BART encoder and decoder.\n", + "In contrast to BERT, the representation of the EOS token is used to classify the sentence relations.

    ELI5 (Fan et al., 2019), a long-form abstractive question answering dataset.\n", + "Models generate answers conditioned on the concatenation of a question and supporting documents.

    XSum (Narayan et al., 2018), a news summarization dataset with highly abstractive summaries.

    ConvAI2 (Dinan et al., 2019), a dialogue response generation task, conditioned on context and a persona.

    CNN/DM (Hermann et al., 2015), a news summarization dataset.\n", + "Summaries here are typically closely related to source sentences.

    4.3 Results

    Results are shown in Table 1.\n", + "Several trends are clear:

    Model | SQuAD 1.1 | MNLI | ELI5 | XSum | ConvAI2 | CNN/DM
    | F1 | Acc | PPL | PPL | PPL | PPL
    BERT Base (Devlin et al., 2019) | 88.5 | 84.3 | - | - | - | -
    Masked Language Model | 90.0 | 83.5 | 24.77 | 7.87 | 12.59 | 7.06
    Masked Seq2seq | 87.0 | 82.1 | 23.40 | 6.80 | 11.43 | 6.19
    Language Model | 76.7 | 80.1 | 21.40 | 7.00 | 11.51 | 6.56
    Permuted Language Model | 89.1 | 83.7 | 24.03 | 7.69 | 12.23 | 6.96
    Multitask Masked Language Model | 89.2 | 82.4 | 23.73 | 7.50 | 12.39 | 6.74
    BART Base w/ Token Masking | 90.4 | 84.1 | 25.05 | 7.08 | 11.73 | 6.10
    w/ Token Deletion | 90.4 | 84.1 | 24.61 | 6.90 | 11.46 | 5.87
    w/ Text Infilling | 90.8 | 84.0 | 24.26 | 6.61 | 11.05 | 5.83
    w/ Document Rotation | 77.2 | 75.3 | 53.69 | 17.14 | 19.87 | 10.59
    w/ Sentence Shuffling | 85.4 | 81.5 | 41.87 | 10.93 | 16.67 | 7.89
    w/ Text Infilling + Sentence Shuffling | 90.8 | 83.8 | 24.17 | 6.62 | 11.12 | 5.41

    Table 1: Comparison of pre-training objectives.\n", + "All models are of comparable size and are trained for 1M steps on a combination of books and Wikipedia data.\n", + "Entries in the bottom two blocks are trained on identical data using the same code-base, and fine-tuned with the same procedures.\n", + "Entries in the second block are inspired by pre-training objectives proposed in previous work, but have been simplified to focus on evaluation objectives (see §4.1).\n", + "Performance varies considerably across tasks, but the BART models with text infilling demonstrate the most consistently strong performance.

    Performance of pre-training methods varies significantly across tasks The effectiveness of pre-training methods is highly dependent on the task.\n", + "For example, a simple language model achieves the best ELI5 performance, but the worst SQUAD results.

    Token masking is crucial Pre-training objectives based on rotating documents or permuting sentences perform poorly in isolation.\n", + "The successful methods either use token deletion or masking, or self-attention masks.\n", + "Deletion appears to outperform masking on generation tasks.

    Left-to-right pre-training improves generation The Masked Language Model and the Permuted Language Model perform less well than others on generation, and are the only models we consider that do not include left-to-right auto-regressive language modelling during pre-training.

    Bidirectional encoders are crucial for SQuAD As noted in previous work (Devlin et al., 2019), just left-to-right decoder performs poorly on SQuAD, because future context is crucial in classification decisions.\n", + "However, BART achieves similar performance with only half the number of bidirectional layers.

    The pre-training objective is not the only important factor Our Permuted Language Model performs less well than XLNet (Yang et al., 2019).\n", + "Some of this difference is likely due to not including other architectural improvements, such as relative-position embeddings or segment-level recurrence.

    Pure language models perform best on ELI5 The ELI5 dataset is an outlier, with much higher perplexities than other tasks, and is the only generation task where other models outperform BART.\n", + "A pure language model performs best, suggesting that BART is less effective when the output is only loosely constrained by the input.

    BART achieves the most consistently strong performance.\n", + "With the exception of ELI5, BART models using text-infilling perform well on all tasks.

    5 Large-scale Pre-training Experiments

    Recent work has shown that downstream performance can dramatically improve when pre-training is scaled to large batch sizes (Yang et al., 2019; Liu et al., 2019) and corpora.\n", + "To test how well BART performs in this regime, and to create a useful model for downstream tasks, we trained BART using the same scale as the RoBERTa model.

    5.1 Experimental Setup

    We pre-train a large model with 12 layers in each of the encoder and decoder, and a hidden size of 1024.\n", + "Following RoBERTa (Liu et al., 2019), we use a batch size of 8000, and train the model for 500000 steps.\n", + "Documents are tokenized with the same byte-pair encoding as GPT-2 (Radford et al., 2019).\n", + "Based on the results in Section §4, we use a combination of text infilling and sentence permutation.\n", + "We mask 30% of tokens in each document, and permute all sentences.\n", + "Although sentence permutation only shows significant additive gains

    SQuAD 1.1 EM/F1SQuAD 2.0 EM/F1MNLI m/mmSST AccQQP AccQNLI AccSTS-B AccRTE AccMRPC AccCoLA Mcc
    BERT84.1/90.979.0/81.886.6/-93.291.392.390.070.488.060.6
    UniLM-/-80.5/83.487.0/85.994.5-92.7-70.9-61.1
    XLNet89.0/94.586.1/88.889.8/-95.691.893.991.883.889.263.6
    RoBERTa88.9/94.686.5/89.490.2/90.296.492.294.792.486.690.968.0
    BART88.8/94.686.1/89.289.9/90.196.692.594.991.287.090.462.8

    Table 2: Results for large models on SQuAD and GLUE tasks.\n", + "BART performs comparably to RoBERTa and XLNet, suggesting that BART’s uni-directional decoder layers do not reduce performance on discriminative tasks.

    CNN/DailyMailXSumR1R2RLR1R2RL
    Lead-340.4217.6236.6716.301.6011.95
    PTGEN (See et al., 2017)36.4415.6633.4229.709.2123.24
    PTGEN+COV (See et al., 2017)39.5317.2836.3828.108.0221.72
    UniLM43.3320.2140.51---
    BERTSUMABS (Liu & Lapata, 2019)41.7219.3938.7638.7616.3331.15
    BERTSUMEXTABS (Liu & Lapata, 2019)42.1319.6039.1838.8116.5031.27
    BART44.1621.2840.9045.1422.2737.25

    Table 3: Results on two standard summarization datasets.\n", + "BART outperforms previous work on summarization on two tasks and all metrics, with gains of roughly 6 points on the more abstractive dataset.

    on the CNN/DM summarization dataset, we hypothesised that larger pre-trained models may be better able to learn from this task.\n", + "To help the model better fit the data, we disabled dropout for the final 10% of training steps.\n", + "We use the same pre-training data as Liu et al.\n", + "(2019), consisting of 160Gb of news, books, stories, and web text.

    5.2 Discriminative Tasks

    Table 2 compares the performance of BART with several recent approaches on the well-studied SQuAD and GLUE tasks (Warstadt et al., 2018; Socher et al., 2013; Dolan & Brockett, 2005; Agirre et al., 2007; Williams et al., 2018; Dagan et al., 2006; Levesque et al., 2011).

    The most directly comparable baseline is RoBERTa, which was pre-trained with the same resources, but a different objective.\n", + "Overall, BART performs similarly, with only small differences between the models on most tasks,\n", + "suggesting that BART’s improvements on generation tasks do not come at the expense of classification performance.

    5.3 Generation Tasks

    We also experiment with several text generation tasks.\n", + "BART is fine-tuned as a standard sequence-to-sequence model from the input to the output text.\n", + "During finetuning we use a label smoothed cross entropy loss (Pereyra et al., 2017), with the smoothing parameter set to 0.1.\n", + "During generation, we set beam size as 5, remove duplicated trigrams in beam search, and tuned the model with min-len, max-len, length penalty on the validation set (Fan et al., 2017).

    ConvAI2Valid F1 Valid PPLSeq2Seq + Attention16.0235.07
    Best System19.0917.51
    BART20.7211.85

    Table 4: BART outperforms previous work on conversational response generation.\n", + "Perplexities are renormalized based on official tokenizer for ConvAI2.

    Summarization To provide a comparison with the state-of-the-art in summarization, we present results on two summarization datasets, CNN/DailyMail and XSum, which have distinct properties.

    Summaries in the CNN/DailyMail tend to resemble source sentences.\n", + "Extractive models do well here, and even the baseline of the first-three source sentences is highly competitive.\n", + "Nevertheless, BART outperforms all existing work.

    In contrast, XSum is highly abstractive, and extractive models perform poorly.\n", + "BART outperforms the best previous work, which leverages BERT, by roughly 6.0 points on all ROUGE metrics—representing a significant advance in performance on this problem.\n", + "Qualitatively, sample quality is high (see §6).

    Dialogue We evaluate dialogue response generation on CONVAI2 (Dinan et al., 2019), in which agents must generate responses conditioned on both the previous context and a textually-specified persona.\n", + "BART outperforms previous work on two automated metrics.

    R2 RL

    Best Extractive23.53.117.5
    Language Model Seq2Seq27.8 28.34.7 5.123.1 22.8
    Seq2Seq Multitask BART28.9 30.65.4 6.223.1 24.3

    Table 5: BART achieves state-of-the-art results on the challenging ELI5 abstractive question answering dataset.\n", + "Comparison models are from Fan et al.\n", + "(2019).

    RO-EN
    7 Related Work
    Baseline36.80
    Fixed BART 36.29 Tuned BART 37.96
    Table 6: The performance (BLEU) of baseline and BART on WMT’16 RO-EN augmented with backtranslation data. BART improves over a strong backtranslation (BT) baseline by using monolingual English pre-training.
    Abstractive QA We use the recently proposed ELI5 dataset to test the model’s ability to generate long freeform answers. We find BART outperforms the best previous work by 1.2 ROUGE-L, but the dataset remains challenging, because answers are only weakly specified by the question.
    5.4 Translation
    We also evaluated performance on WMT16 Romanian-English, augmented with back-translation data from Sennrich et al. (2016). We use a 6-layer transformer source encoder to map Romanian into a representation that BART is able to de-noise into English, following the approach introduced in §3.4. Experiment results are presented in Table 6. We compare our results against a baseline Transformer architecture (Vaswani et al., 2017) with Transformer-large settings (the baseline row). We show the performance of both steps of our model in the fixed BART and tuned BART rows. For each row we experiment on the original WMT16 Romanian-English augmented with back-translation data. We use a beam width of 5 and a length penalty of α = 1. Preliminary results suggested that our approach was less effective without back-translation data, and prone to overfitting—future work should explore additional regularization techniques.
    6 Qualitative Analysis
    BART shows large improvements on summarization metrics, of up to 6 points over the prior state-of-the-art. To understand BART’s performance beyond automated metrics, we analyse its generations qualitatively.
    Table 7 shows example summaries generated by BART. Examples are taken from WikiNews articles published after the creation of the pre-training corpus, to eliminate the possibility of the events described being present in the model’s training data. Following Narayan et al. (2018), we remove the first sentence of the article prior to summarizing it, so there is no easy extractive summary of the document.
    Unsurprisingly, model output is fluent and grammatical English. However, model output is also highly abstractive, with few phrases copied from the input. The output is also generally factually accurate, and integrates supporting evidence from across the input document with background knowledge (for example, correctly completing names, or inferring that PG&E operates in California). In the first example, inferring that fish are protecting reefs from global warming requires non-trivial inference from the text. However, the claim that the work was published in Science is not supported by the source.
    These samples demonstrate that the BART pretraining has learned a strong combination of natural language understanding and generation.
    Early methods for pretraining were based on language models. GPT (Radford et al., 2018) only models leftward context, which is problematic for some tasks. ELMo (Peters et al., 2018) concatenates left-only and right-only representations, but does not pre-train interactions between these features. Radford et al. (2019) demonstrated that very large language models can act as unsupervised multitask models.
    BERT (Devlin et al., 2019) introduced masked language modelling, which allows pre-training to learn interactions between left and right context words. Recent work has shown that very strong performance can be achieved by training for longer (Liu et al., 2019), by tying parameters across layers (Lan et al., 2019), and by masking spans instead of words (Joshi et al., 2019). Predictions are not made auto-regressively, reducing the effectiveness of BERT for generation tasks.
    UniLM (Dong et al., 2019) fine-tunes BERT with an ensemble of masks, some of which allow only leftward context. Like BART, this allows UniLM to be used for both generative and discriminative tasks. A difference is that UniLM predictions are conditionally independent, whereas BART’s are autoregressive. BART reduces the mismatch between pre-training and generation tasks, because the decoder is always trained on uncorrupted context.
    MASS (Song et al., 2019) is perhaps the most similar model to BART. An input sequence where a contiguous span of tokens is masked is mapped to a sequence consisting of the missing tokens. MASS is less effective for discriminative tasks, because disjoint sets of tokens are fed into the encoder and decoder.
    XL-Net (Yang et al., 2019) extends BERT by pre-
    Source Document (abbreviated)BART Summary
    The researchers examined three types of coral in reefs off the coast of Fiji The researchers found when fish were plentiful, they would eat algae and seaweed off the corals, which appeared to leave them more resistant to the bacterium Vibrio coralliilyticus, a bacterium associated with bleaching. The researchers suggested the algae, like warming temperatures, might render the corals’ chemical defenses less effective, and the fish were protecting the coral by removing the algae. Fisheries off the coast of Fiji are protecting coral reefs from the effects of global warming, according to a study in the journal Science.
    Sacoolas, who has immunity as a diplomat’s wife, was involved in a traffic collision Prime Minister Johnson was questioned about the case while speaking to the press at a hospital in Watford. He said, “I hope that Anne Sacoolas will come back if we can’t resolve it then of course I will be raising it myself personally with the White House.” Boris Johnson has said he will raise the issue of US diplomat Anne Sacoolas’ diplomatic immunity with the White House.
    According to Syrian state media, government forces began deploying into previously SDF controlled territory yesterday. On October 6, US President Donald Trump and Turkish President Recep Tayyip Erdoğan spoke on the phone. Then both nations issued statements speaking of an imminent incursion into northeast Syria. On Wednesday, Turkey began a military offensive with airstrikes followed by a ground invasion. Syrian government forces have entered territory held by the US-backed Syrian Democratic Forces (SDF) in response to Turkey’s incursion into the region.
    This is the first time anyone has been recorded to run a full marathon of 42.195 kilometers (approximately 26 miles) under this pursued landmark time. It was not, however, an officially sanctioned world record, as it was not an “open race” of the IAAF. His time was 1 hour 59 minutes 40.2 seconds. Kipchoge ran in Vienna, Austria. It was an event specifically designed to help Kipchoge break the two hour barrier. Kenyan runner Eliud Kipchoge has run a marathon in less than two hours.
    PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow.Power has been turned off to millions of customers in California as part of a power shutoff plan.

    Table 7: Example summaries from the XSum-tuned BART model on WikiNews articles.\n", + "For clarity, only relevant excerpts of the source are shown.\n", + "Summaries combine information from across the article and prior knowledge.

    dicting masked tokens auto-regressively in a permuted order.\n", + "This objective allows predictions to condition on both left and right context.\n", + "In contrast, the BART decoder works left-to-right during pre-training, matching the setting during generation.

    Several papers have explored using pre-trained representations to improve machine translation.\n", + "The largest improvements have come from pre-training on both source and target languages (Song et al., 2019; Lample & Conneau, 2019), but this requires pretraining on all languages of interest.\n", + "Other work has shown that encoders can be improved using pre-trained representations (Edunov et al., 2019), but gains in decoders are more limited.\n", + "We show how BART can be used to improve machine translation decoders.

    " + ], "text/plain": [ - "{'bbox': [179.52, 165.33, 421.01, 177.29000000000002],\n", - " 'block_class': 'cls_5',\n", - " 'block_idx': 2,\n", - " 'level': 1,\n", - " 'page_idx': 0,\n", - " 'sentences': ['{mikelewis,yinhanliu,naman}@fb.com'],\n", - " 'tag': 'header'}" + "" ] }, - "execution_count": 74, + "execution_count": 120, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# HTML(doc.sections()[0].to_html(include_children=True, recurse=True))\n", - "doc.sections()[1].block_json" + "HTML(doc.sections()[0].to_html(include_children=True, recurse=True))\n", + "# doc.sections()[1].block_json\n", + "# doc.sections()[0].to_text()" ] } ], diff --git a/setup.py b/setup.py index 945a0e3..7fc0e90 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import setup, find_packages setup( name='nlm-ingestor', - version='0.1.2', + version='0.1.3', description='Parsers and ingestors for different file types and formats', long_description=open('README.md').read(), long_description_content_type='text/markdown',