diff --git a/hipe2022-datasets-stats.ipynb b/hipe2022-datasets-stats.ipynb index 68ace6b..ab95687 100644 --- a/hipe2022-datasets-stats.ipynb +++ b/hipe2022-datasets-stats.ipynb @@ -18,7 +18,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 46, "id": "39ac9869", "metadata": {}, "outputs": [], @@ -40,7 +40,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 47, "id": "a7f403d6", "metadata": {}, "outputs": [], @@ -77,7 +77,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 48, "id": "0f6a13c1", "metadata": {}, "outputs": [], @@ -106,7 +106,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 49, "id": "f1d7980b", "metadata": {}, "outputs": [], @@ -114,14 +114,17 @@ "# EN\n", "ajmc_train_en_path = os.path.join(HIPE2022_data_path, f\"ajmc/en/HIPE-2022-{RELEASE_VERSION}-ajmc-train-en.tsv\")\n", "ajmc_dev_en_path = os.path.join(HIPE2022_data_path, f\"ajmc/en/HIPE-2022-{RELEASE_VERSION}-ajmc-dev-en.tsv\")\n", + "ajmc_test_en_path = os.path.join(HIPE2022_data_path, f\"ajmc/en/HIPE-2022-{RELEASE_VERSION}-ajmc-test-en.tsv\")\n", "\n", "# DE\n", "ajmc_train_de_path = os.path.join(HIPE2022_data_path, f\"ajmc/de/HIPE-2022-{RELEASE_VERSION}-ajmc-train-de.tsv\")\n", "ajmc_dev_de_path = os.path.join(HIPE2022_data_path, f\"ajmc/de/HIPE-2022-{RELEASE_VERSION}-ajmc-dev-de.tsv\")\n", + "ajmc_test_de_path = os.path.join(HIPE2022_data_path, f\"ajmc/de/HIPE-2022-{RELEASE_VERSION}-ajmc-test-de.tsv\")\n", "\n", "# FR\n", "ajmc_train_fr_path = os.path.join(HIPE2022_data_path, f\"ajmc/fr/HIPE-2022-{RELEASE_VERSION}-ajmc-train-fr.tsv\")\n", - "ajmc_dev_fr_path = os.path.join(HIPE2022_data_path, f\"ajmc/fr/HIPE-2022-{RELEASE_VERSION}-ajmc-dev-fr.tsv\")" + "ajmc_dev_fr_path = os.path.join(HIPE2022_data_path, f\"ajmc/fr/HIPE-2022-{RELEASE_VERSION}-ajmc-dev-fr.tsv\")\n", + "ajmc_test_fr_path = os.path.join(HIPE2022_data_path, f\"ajmc/fr/HIPE-2022-{RELEASE_VERSION}-ajmc-test-fr.tsv\")" ] }, { @@ -142,7 
+145,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 50, "id": "d7d48341", "metadata": {}, "outputs": [], @@ -153,7 +156,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 51, "id": "d4d2e96e", "metadata": {}, "outputs": [ @@ -223,7 +226,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 52, "id": "9c59de36", "metadata": {}, "outputs": [], @@ -234,7 +237,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 53, "id": "27ed577b", "metadata": {}, "outputs": [ @@ -314,6 +317,87 @@ "print(describe_dataset(documents=ajmc_train_en_docs))" ] }, + { + "cell_type": "markdown", + "id": "ddd117b3", + "metadata": {}, + "source": [ + "#### Test" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "8c60c253", + "metadata": {}, + "outputs": [], + "source": [ + "# parse the TSV into a list of `HipeDocument` objects\n", + "ajmc_test_en_docs = parse_tsv(file_path=ajmc_test_en_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "b9126e02", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Path of the TSV file: ./data/v2.1/ajmc/en/HIPE-2022-v2.1-ajmc-test-en.tsv \n", + "Number of documents: 13 \n", + "Number of entities: {'coarse_lit': 348, 'fine_lit': 348} \n", + "Number of tokens: 6052 \n", + "Entity breakdown by type: coarse_lit\n", + "+-------+---------+\n", + "| | count |\n", + "+=======+=========+\n", + "| date | 3 |\n", + "+-------+---------+\n", + "| loc | 3 |\n", + "+-------+---------+\n", + "| pers | 96 |\n", + "+-------+---------+\n", + "| scope | 151 |\n", + "+-------+---------+\n", + "| work | 95 |\n", + "+-------+---------+\n", + "fine_lit\n", + "+--------------+---------+\n", + "| | count |\n", + "+==============+=========+\n", + "| date | 3 |\n", + "+--------------+---------+\n", + "| loc | 3 |\n", + "+--------------+---------+\n", + "| pers.author | 47 |\n", + 
"+--------------+---------+\n", + "| pers.editor | 4 |\n", + "+--------------+---------+\n", + "| pers.myth | 41 |\n", + "+--------------+---------+\n", + "| pers.other | 4 |\n", + "+--------------+---------+\n", + "| scope | 151 |\n", + "+--------------+---------+\n", + "| work.fragm | 2 |\n", + "+--------------+---------+\n", + "| work.primlit | 83 |\n", + "+--------------+---------+\n", + "| work.seclit | 10 |\n", + "+--------------+---------+\n", + "\n" + ] + } + ], + "source": [ + "# print some basic stats for the TSV dataset \n", + "print(describe_dataset(documents=ajmc_test_en_docs))" + ] + }, { "cell_type": "markdown", "id": "85a78a1a", @@ -332,7 +416,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 56, "id": "e10a9e0b", "metadata": {}, "outputs": [], @@ -343,7 +427,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 57, "id": "7b3fd754", "metadata": {}, "outputs": [ @@ -417,7 +501,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 58, "id": "8da40406", "metadata": {}, "outputs": [], @@ -428,7 +512,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 59, "id": "af8ae6f5", "metadata": {}, "outputs": [ @@ -508,6 +592,81 @@ "print(describe_dataset(documents=ajmc_train_de_docs))" ] }, + { + "cell_type": "markdown", + "id": "71fe8a79", + "metadata": {}, + "source": [ + "#### Test" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "f0c87457", + "metadata": {}, + "outputs": [], + "source": [ + "# parse the TSV into a list of `HipeDocument` objects\n", + "ajmc_test_de_docs = parse_tsv(file_path=ajmc_test_de_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "id": "f42c1222", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Path of the TSV file: ./data/v2.1/ajmc/de/HIPE-2022-v2.1-ajmc-test-de.tsv \n", + "Number of documents: 16 \n", + "Number of entities: {'coarse_lit': 
382, 'fine_lit': 382} \n", + "Number of tokens: 4845 \n", + "Entity breakdown by type: coarse_lit\n", + "+--------+---------+\n", + "| | count |\n", + "+========+=========+\n", + "| loc | 2 |\n", + "+--------+---------+\n", + "| object | 2 |\n", + "+--------+---------+\n", + "| pers | 128 |\n", + "+--------+---------+\n", + "| scope | 176 |\n", + "+--------+---------+\n", + "| work | 74 |\n", + "+--------+---------+\n", + "fine_lit\n", + "+----------------+---------+\n", + "| | count |\n", + "+================+=========+\n", + "| loc | 2 |\n", + "+----------------+---------+\n", + "| object.manuscr | 2 |\n", + "+----------------+---------+\n", + "| pers.author | 48 |\n", + "+----------------+---------+\n", + "| pers.editor | 4 |\n", + "+----------------+---------+\n", + "| pers.myth | 76 |\n", + "+----------------+---------+\n", + "| scope | 176 |\n", + "+----------------+---------+\n", + "| work.primlit | 74 |\n", + "+----------------+---------+\n", + "\n" + ] + } + ], + "source": [ + "# print some basic stats for the TSV dataset \n", + "print(describe_dataset(documents=ajmc_test_de_docs))" + ] + }, { "cell_type": "markdown", "id": "f498af21", @@ -526,7 +685,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 62, "id": "cc683514", "metadata": {}, "outputs": [], @@ -537,7 +696,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 63, "id": "0644243e", "metadata": {}, "outputs": [ @@ -595,7 +754,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 64, "id": "de15545d", "metadata": {}, "outputs": [], @@ -606,7 +765,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 65, "id": "786a76a5", "metadata": {}, "outputs": [ @@ -684,6 +843,87 @@ "print(describe_dataset(documents=ajmc_train_fr_docs))" ] }, + { + "cell_type": "markdown", + "id": "1e143ac9", + "metadata": {}, + "source": [ + "#### Test" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "id": "4e0e0e7e", 
+ "metadata": {}, + "outputs": [], + "source": [ + "# parse the TSV into a list of `HipeDocument` objects\n", + "ajmc_test_fr_docs = parse_tsv(file_path=ajmc_test_fr_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "id": "20ecae70", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Path of the TSV file: ./data/v2.1/ajmc/fr/HIPE-2022-v2.1-ajmc-test-fr.tsv \n", + "Number of documents: 15 \n", + "Number of entities: {'coarse_lit': 360, 'fine_lit': 360} \n", + "Number of tokens: 5390 \n", + "Entity breakdown by type: coarse_lit\n", + "+-------+---------+\n", + "| | count |\n", + "+=======+=========+\n", + "| date | 3 |\n", + "+-------+---------+\n", + "| loc | 9 |\n", + "+-------+---------+\n", + "| pers | 139 |\n", + "+-------+---------+\n", + "| scope | 129 |\n", + "+-------+---------+\n", + "| work | 80 |\n", + "+-------+---------+\n", + "fine_lit\n", + "+--------------+---------+\n", + "| | count |\n", + "+==============+=========+\n", + "| date | 3 |\n", + "+--------------+---------+\n", + "| loc | 9 |\n", + "+--------------+---------+\n", + "| pers.author | 78 |\n", + "+--------------+---------+\n", + "| pers.editor | 35 |\n", + "+--------------+---------+\n", + "| pers.myth | 14 |\n", + "+--------------+---------+\n", + "| pers.other | 12 |\n", + "+--------------+---------+\n", + "| scope | 129 |\n", + "+--------------+---------+\n", + "| work.fragm | 1 |\n", + "+--------------+---------+\n", + "| work.primlit | 78 |\n", + "+--------------+---------+\n", + "| work.seclit | 1 |\n", + "+--------------+---------+\n", + "\n" + ] + } + ], + "source": [ + "# print some basic stats for the TSV dataset \n", + "print(describe_dataset(documents=ajmc_test_fr_docs))" + ] + }, { "cell_type": "markdown", "id": "8d2a85c1", @@ -704,21 +944,25 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 68, "id": "f992992b", "metadata": {}, "outputs": [], "source": [ "# EN\n", 
"hipe2020_dev_en_path = os.path.join(HIPE2022_data_path, f\"hipe2020/en/HIPE-2022-{RELEASE_VERSION}-hipe2020-dev-en.tsv\")\n", + "hipe2020_test_en_path = os.path.join(HIPE2022_data_path, f\"hipe2020/en/HIPE-2022-{RELEASE_VERSION}-hipe2020-test-en.tsv\")\n", "\n", "# DE\n", "hipe2020_train_de_path = os.path.join(HIPE2022_data_path, f\"hipe2020/de/HIPE-2022-{RELEASE_VERSION}-hipe2020-train-de.tsv\")\n", "hipe2020_dev_de_path = os.path.join(HIPE2022_data_path, f\"hipe2020/de/HIPE-2022-{RELEASE_VERSION}-hipe2020-dev-de.tsv\")\n", + "hipe2020_test_de_path = os.path.join(HIPE2022_data_path, f\"hipe2020/de/HIPE-2022-{RELEASE_VERSION}-hipe2020-test-de.tsv\")\n", + "\n", "\n", "# FR\n", "hipe2020_dev_fr_path = os.path.join(HIPE2022_data_path, f\"hipe2020/fr/HIPE-2022-{RELEASE_VERSION}-hipe2020-dev-fr.tsv\")\n", - "hipe2020_train_fr_path = os.path.join(HIPE2022_data_path, f\"hipe2020/fr/HIPE-2022-{RELEASE_VERSION}-hipe2020-train-fr.tsv\")" + "hipe2020_train_fr_path = os.path.join(HIPE2022_data_path, f\"hipe2020/fr/HIPE-2022-{RELEASE_VERSION}-hipe2020-train-fr.tsv\")\n", + "hipe2020_test_fr_path = os.path.join(HIPE2022_data_path, f\"hipe2020/fr/HIPE-2022-{RELEASE_VERSION}-hipe2020-test-fr.tsv\")" ] }, { @@ -739,7 +983,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 69, "id": "5980486f", "metadata": {}, "outputs": [], @@ -750,7 +994,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 70, "id": "e1ebf584", "metadata": {}, "outputs": [ @@ -794,6 +1038,71 @@ "print(describe_dataset(documents=hipe2020_dev_en_docs))" ] }, + { + "cell_type": "markdown", + "id": "b8f5ffdc", + "metadata": {}, + "source": [ + "#### Test\n" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "id": "09095db0", + "metadata": {}, + "outputs": [], + "source": [ + "# parse the TSV into a list of `HipeDocument` objects\n", + "hipe2020_test_en_docs = parse_tsv(file_path=hipe2020_test_en_path)" + ] + }, + { + "cell_type": "code", + 
"execution_count": 72, + "id": "f5bc2a16", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Path of the TSV file: ./data/v2.1/hipe2020/en/HIPE-2022-v2.1-hipe2020-test-en.tsv \n", + "Number of documents: 46 \n", + "Number of entities: {'coarse_lit': 449, 'coarse_meto': 25} \n", + "Number of tokens: 16634 \n", + "Entity breakdown by type: coarse_lit\n", + "+------+---------+\n", + "| | count |\n", + "+======+=========+\n", + "| loc | 181 |\n", + "+------+---------+\n", + "| org | 76 |\n", + "+------+---------+\n", + "| pers | 156 |\n", + "+------+---------+\n", + "| prod | 19 |\n", + "+------+---------+\n", + "| time | 17 |\n", + "+------+---------+\n", + "coarse_meto\n", + "+-----+---------+\n", + "| | count |\n", + "+=====+=========+\n", + "| loc | 3 |\n", + "+-----+---------+\n", + "| org | 22 |\n", + "+-----+---------+\n", + "\n" + ] + } + ], + "source": [ + "# print some basic stats for the TSV dataset \n", + "print(describe_dataset(documents=hipe2020_test_en_docs))" + ] + }, { "cell_type": "markdown", "id": "374f4ccb", @@ -812,7 +1121,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 73, "id": "ee918688", "metadata": {}, "outputs": [], @@ -823,7 +1132,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 74, "id": "ca1b34aa", "metadata": {}, "outputs": [ @@ -967,7 +1276,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 75, "id": "d3f07e60", "metadata": {}, "outputs": [], @@ -978,7 +1287,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 76, "id": "681ff82c", "metadata": {}, "outputs": [ @@ -1128,35 +1437,27 @@ }, { "cell_type": "markdown", - "id": "3d98272e", - "metadata": {}, - "source": [ - "### hipe2020 FR" - ] - }, - { - "cell_type": "markdown", - "id": "070880bf", + "id": "14039039", "metadata": {}, "source": [ - "#### Dev" + "#### Test\n" ] }, { "cell_type": "code", - "execution_count": 24, - 
"id": "eb508bb1", + "execution_count": 77, + "id": "cb729ef9", "metadata": {}, "outputs": [], "source": [ "# parse the TSV into a list of `HipeDocument` objects\n", - "hipe2020_dev_fr_docs = parse_tsv(file_path=hipe2020_dev_fr_path)" + "hipe2020_test_de_docs = parse_tsv(file_path=hipe2020_test_de_path)" ] }, { "cell_type": "code", - "execution_count": 25, - "id": "1eac079a", + "execution_count": 78, + "id": "dc7c8dec", "metadata": {}, "outputs": [ { @@ -1164,35 +1465,196 @@ "output_type": "stream", "text": [ "\n", - "Path of the TSV file: ./data/v2.1/hipe2020/fr/HIPE-2022-v2.1-hipe2020-dev-fr.tsv \n", - "Number of documents: 43 \n", - "Number of entities: {'coarse_lit': 1729, 'coarse_meto': 108, 'fine_lit': 1729, 'fine_meto': 108, 'fine_comp': 724, 'nested': 91} \n", - "Number of tokens: 37953 \n", + "Path of the TSV file: ./data/v2.1/hipe2020/de/HIPE-2022-v2.1-hipe2020-test-de.tsv \n", + "Number of documents: 49 \n", + "Number of entities: {'coarse_lit': 1147, 'coarse_meto': 118, 'fine_lit': 1147, 'fine_meto': 118, 'fine_comp': 431, 'nested': 73} \n", + "Number of tokens: 30737 \n", "Entity breakdown by type: coarse_lit\n", "+------+---------+\n", "| | count |\n", "+======+=========+\n", - "| loc | 774 |\n", + "| loc | 595 |\n", "+------+---------+\n", - "| org | 159 |\n", + "| org | 130 |\n", "+------+---------+\n", - "| pers | 679 |\n", + "| pers | 311 |\n", "+------+---------+\n", - "| prod | 49 |\n", + "| prod | 62 |\n", "+------+---------+\n", - "| time | 68 |\n", + "| time | 49 |\n", "+------+---------+\n", "coarse_meto\n", - "+-----+---------+\n", - "| | count |\n", - "+=====+=========+\n", - "| loc | 3 |\n", - "+-----+---------+\n", - "| org | 105 |\n", - "+-----+---------+\n", - "fine_lit\n", - "+------------------------+---------+\n", - "| | count |\n", + "+------+---------+\n", + "| | count |\n", + "+======+=========+\n", + "| loc | 1 |\n", + "+------+---------+\n", + "| org | 116 |\n", + "+------+---------+\n", + "| pers | 1 |\n", + 
"+------+---------+\n", + "fine_lit\n", + "+------------------------+---------+\n", + "| | count |\n", + "+========================+=========+\n", + "| loc.adm.nat | 160 |\n", + "+------------------------+---------+\n", + "| loc.adm.reg | 84 |\n", + "+------------------------+---------+\n", + "| loc.adm.sup | 21 |\n", + "+------------------------+---------+\n", + "| loc.adm.town | 257 |\n", + "+------------------------+---------+\n", + "| loc.fac | 14 |\n", + "+------------------------+---------+\n", + "| loc.oro | 5 |\n", + "+------------------------+---------+\n", + "| loc.phys.astro | 10 |\n", + "+------------------------+---------+\n", + "| loc.phys.geo | 14 |\n", + "+------------------------+---------+\n", + "| loc.phys.hydro | 29 |\n", + "+------------------------+---------+\n", + "| loc.unk | 1 |\n", + "+------------------------+---------+\n", + "| org.adm | 29 |\n", + "+------------------------+---------+\n", + "| org.ent | 83 |\n", + "+------------------------+---------+\n", + "| org.ent.pressagency | 18 |\n", + "+------------------------+---------+\n", + "| pers.ind | 309 |\n", + "+------------------------+---------+\n", + "| pers.ind.articleauthor | 2 |\n", + "+------------------------+---------+\n", + "| prod.doctr | 5 |\n", + "+------------------------+---------+\n", + "| prod.media | 57 |\n", + "+------------------------+---------+\n", + "| time.date.abs | 49 |\n", + "+------------------------+---------+\n", + "fine_meto\n", + "+-------------+---------+\n", + "| | count |\n", + "+=============+=========+\n", + "| loc.adm.nat | 1 |\n", + "+-------------+---------+\n", + "| org.adm | 116 |\n", + "+-------------+---------+\n", + "| pers.ind | 1 |\n", + "+-------------+---------+\n", + "fine_comp\n", + "+----------------+---------+\n", + "| | count |\n", + "+================+=========+\n", + "| comp.demonym | 17 |\n", + "+----------------+---------+\n", + "| comp.function | 66 |\n", + "+----------------+---------+\n", + "| comp.name | 214 |\n", + 
"+----------------+---------+\n", + "| comp.qualifier | 33 |\n", + "+----------------+---------+\n", + "| comp.title | 101 |\n", + "+----------------+---------+\n", + "nested\n", + "+----------------+---------+\n", + "| | count |\n", + "+================+=========+\n", + "| loc.adm.nat | 4 |\n", + "+----------------+---------+\n", + "| loc.adm.reg | 4 |\n", + "+----------------+---------+\n", + "| loc.adm.sup | 1 |\n", + "+----------------+---------+\n", + "| loc.adm.town | 45 |\n", + "+----------------+---------+\n", + "| loc.phys.hydro | 9 |\n", + "+----------------+---------+\n", + "| org.adm | 2 |\n", + "+----------------+---------+\n", + "| org.ent | 4 |\n", + "+----------------+---------+\n", + "| pers.ind | 3 |\n", + "+----------------+---------+\n", + "| prod.media | 1 |\n", + "+----------------+---------+\n", + "\n" + ] + } + ], + "source": [ + "# print some basic stats for the TSV dataset \n", + "print(describe_dataset(documents=hipe2020_test_de_docs))" + ] + }, + { + "cell_type": "markdown", + "id": "3d98272e", + "metadata": {}, + "source": [ + "### hipe2020 FR" + ] + }, + { + "cell_type": "markdown", + "id": "070880bf", + "metadata": {}, + "source": [ + "#### Dev" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "id": "eb508bb1", + "metadata": {}, + "outputs": [], + "source": [ + "# parse the TSV into a list of `HipeDocument` objects\n", + "hipe2020_dev_fr_docs = parse_tsv(file_path=hipe2020_dev_fr_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "id": "1eac079a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Path of the TSV file: ./data/v2.1/hipe2020/fr/HIPE-2022-v2.1-hipe2020-dev-fr.tsv \n", + "Number of documents: 43 \n", + "Number of entities: {'coarse_lit': 1729, 'coarse_meto': 108, 'fine_lit': 1729, 'fine_meto': 108, 'fine_comp': 724, 'nested': 91} \n", + "Number of tokens: 37953 \n", + "Entity breakdown by type: coarse_lit\n", + "+------+---------+\n", 
+ "| | count |\n", + "+======+=========+\n", + "| loc | 774 |\n", + "+------+---------+\n", + "| org | 159 |\n", + "+------+---------+\n", + "| pers | 679 |\n", + "+------+---------+\n", + "| prod | 49 |\n", + "+------+---------+\n", + "| time | 68 |\n", + "+------+---------+\n", + "coarse_meto\n", + "+-----+---------+\n", + "| | count |\n", + "+=====+=========+\n", + "| loc | 3 |\n", + "+-----+---------+\n", + "| org | 105 |\n", + "+-----+---------+\n", + "fine_lit\n", + "+------------------------+---------+\n", + "| | count |\n", "+========================+=========+\n", "| loc.add.phys | 1 |\n", "+------------------------+---------+\n", @@ -1301,7 +1763,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 81, "id": "e21ac6f4", "metadata": {}, "outputs": [], @@ -1312,7 +1774,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 82, "id": "5824fc8b", "metadata": {}, "outputs": [ @@ -1456,6 +1918,157 @@ "print(describe_dataset(documents=hipe2020_train_fr_docs))" ] }, + { + "cell_type": "markdown", + "id": "cc5bd52d", + "metadata": {}, + "source": [ + "#### Test\n" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "id": "ff3b0737", + "metadata": {}, + "outputs": [], + "source": [ + "# parse the TSV into a list of `HipeDocument` objects\n", + "hipe2020_test_fr_docs = parse_tsv(file_path=hipe2020_test_fr_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "id": "d7da663f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Path of the TSV file: ./data/v2.1/hipe2020/fr/HIPE-2022-v2.1-hipe2020-test-fr.tsv \n", + "Number of documents: 43 \n", + "Number of entities: {'coarse_lit': 1600, 'coarse_meto': 112, 'fine_lit': 1600, 'fine_meto': 112, 'fine_comp': 709, 'nested': 82} \n", + "Number of tokens: 40854 \n", + "Entity breakdown by type: coarse_lit\n", + "+------+---------+\n", + "| | count |\n", + "+======+=========+\n", + "| loc | 854 
|\n", + "+------+---------+\n", + "| org | 130 |\n", + "+------+---------+\n", + "| pers | 502 |\n", + "+------+---------+\n", + "| prod | 61 |\n", + "+------+---------+\n", + "| time | 53 |\n", + "+------+---------+\n", + "coarse_meto\n", + "+------+---------+\n", + "| | count |\n", + "+======+=========+\n", + "| org | 111 |\n", + "+------+---------+\n", + "| time | 1 |\n", + "+------+---------+\n", + "fine_lit\n", + "+------------------------+---------+\n", + "| | count |\n", + "+========================+=========+\n", + "| loc.adm.nat | 151 |\n", + "+------------------------+---------+\n", + "| loc.adm.reg | 147 |\n", + "+------------------------+---------+\n", + "| loc.adm.sup | 19 |\n", + "+------------------------+---------+\n", + "| loc.adm.town | 446 |\n", + "+------------------------+---------+\n", + "| loc.fac | 18 |\n", + "+------------------------+---------+\n", + "| loc.oro | 19 |\n", + "+------------------------+---------+\n", + "| loc.phys.geo | 28 |\n", + "+------------------------+---------+\n", + "| loc.phys.hydro | 23 |\n", + "+------------------------+---------+\n", + "| loc.unk | 3 |\n", + "+------------------------+---------+\n", + "| org.adm | 43 |\n", + "+------------------------+---------+\n", + "| org.ent | 67 |\n", + "+------------------------+---------+\n", + "| org.ent.pressagency | 20 |\n", + "+------------------------+---------+\n", + "| pers.coll | 5 |\n", + "+------------------------+---------+\n", + "| pers.ind | 487 |\n", + "+------------------------+---------+\n", + "| pers.ind.articleauthor | 10 |\n", + "+------------------------+---------+\n", + "| prod.doctr | 3 |\n", + "+------------------------+---------+\n", + "| prod.media | 58 |\n", + "+------------------------+---------+\n", + "| time.date.abs | 53 |\n", + "+------------------------+---------+\n", + "fine_meto\n", + "+---------------+---------+\n", + "| | count |\n", + "+===============+=========+\n", + "| org.adm | 106 |\n", + "+---------------+---------+\n", + "| 
org.ent | 5 |\n", + "+---------------+---------+\n", + "| time.date.abs | 1 |\n", + "+---------------+---------+\n", + "fine_comp\n", + "+----------------+---------+\n", + "| | count |\n", + "+================+=========+\n", + "| comp.demonym | 10 |\n", + "+----------------+---------+\n", + "| comp.function | 110 |\n", + "+----------------+---------+\n", + "| comp.name | 351 |\n", + "+----------------+---------+\n", + "| comp.qualifier | 20 |\n", + "+----------------+---------+\n", + "| comp.title | 218 |\n", + "+----------------+---------+\n", + "nested\n", + "+----------------+---------+\n", + "| | count |\n", + "+================+=========+\n", + "| loc.adm.nat | 8 |\n", + "+----------------+---------+\n", + "| loc.adm.reg | 9 |\n", + "+----------------+---------+\n", + "| loc.adm.town | 40 |\n", + "+----------------+---------+\n", + "| loc.phys.geo | 1 |\n", + "+----------------+---------+\n", + "| loc.phys.hydro | 4 |\n", + "+----------------+---------+\n", + "| org.adm | 3 |\n", + "+----------------+---------+\n", + "| org.ent | 6 |\n", + "+----------------+---------+\n", + "| pers.ind | 9 |\n", + "+----------------+---------+\n", + "| prod.media | 2 |\n", + "+----------------+---------+\n", + "\n" + ] + } + ], + "source": [ + "# print some basic stats for the TSV dataset \n", + "print(describe_dataset(documents=hipe2020_test_fr_docs))" + ] + }, { "cell_type": "markdown", "id": "f0b06c25", @@ -1476,13 +2089,14 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 85, "id": "61dec1da", "metadata": {}, "outputs": [], "source": [ "letemps_dev_fr_path = os.path.join(HIPE2022_data_path, f\"letemps/fr/HIPE-2022-{RELEASE_VERSION}-letemps-dev-fr.tsv\")\n", - "letemps_train_fr_path = os.path.join(HIPE2022_data_path, f\"letemps/fr/HIPE-2022-{RELEASE_VERSION}-letemps-train-fr.tsv\")" + "letemps_train_fr_path = os.path.join(HIPE2022_data_path, f\"letemps/fr/HIPE-2022-{RELEASE_VERSION}-letemps-train-fr.tsv\")\n", + "letemps_test_fr_path = 
os.path.join(HIPE2022_data_path, f\"letemps/fr/HIPE-2022-{RELEASE_VERSION}-letemps-test-fr.tsv\")" ] }, { @@ -1495,7 +2109,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 86, "id": "ceedad6b", "metadata": {}, "outputs": [], @@ -1506,7 +2120,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 87, "id": "cae0932b", "metadata": {}, "outputs": [ @@ -1592,7 +2206,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 88, "id": "cc6d3eee", "metadata": {}, "outputs": [], @@ -1603,7 +2217,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 89, "id": "867d6f04", "metadata": {}, "outputs": [ @@ -1693,6 +2307,103 @@ "print(describe_dataset(documents=letemps_train_fr_docs))" ] }, + { + "cell_type": "markdown", + "id": "3b23248f", + "metadata": {}, + "source": [ + "#### Test" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "id": "bafec598", + "metadata": {}, + "outputs": [], + "source": [ + "# parse the TSV into a list of `HipeDocument` objects\n", + "letemps_test_fr_docs = parse_tsv(file_path=letemps_test_fr_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "id": "91bc22b0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Path of the TSV file: ./data/v2.1/letemps/fr/HIPE-2022-v2.1-letemps-test-fr.tsv \n", + "Number of documents: 51 \n", + "Number of entities: {'coarse_lit': 1017, 'fine_lit': 1017, 'nested': 12} \n", + "Number of tokens: 48468 \n", + "Entity breakdown by type: coarse_lit\n", + "+------+---------+\n", + "| | count |\n", + "+======+=========+\n", + "| loc | 591 |\n", + "+------+---------+\n", + "| org | 79 |\n", + "+------+---------+\n", + "| pers | 347 |\n", + "+------+---------+\n", + "fine_lit\n", + "+----------------+---------+\n", + "| | count |\n", + "+================+=========+\n", + "| loc | 16 |\n", + "+----------------+---------+\n", + "| loc.add.phys | 8 |\n",
+ "+----------------+---------+\n", + "| loc.adm | 10 |\n", + "+----------------+---------+\n", + "| loc.adm.nat | 155 |\n", + "+----------------+---------+\n", + "| loc.adm.reg | 45 |\n", + "+----------------+---------+\n", + "| loc.adm.town | 311 |\n", + "+----------------+---------+\n", + "| loc.admin.sup | 3 |\n", + "+----------------+---------+\n", + "| loc.phys | 6 |\n", + "+----------------+---------+\n", + "| loc.phys.geo | 26 |\n", + "+----------------+---------+\n", + "| loc.phys.hydro | 11 |\n", + "+----------------+---------+\n", + "| org.adm | 36 |\n", + "+----------------+---------+\n", + "| org.ent | 43 |\n", + "+----------------+---------+\n", + "| pers | 12 |\n", + "+----------------+---------+\n", + "| pers.coll | 8 |\n", + "+----------------+---------+\n", + "| pers.ind | 327 |\n", + "+----------------+---------+\n", + "nested\n", + "+--------------+---------+\n", + "| | count |\n", + "+==============+=========+\n", + "| loc.adm.town | 10 |\n", + "+--------------+---------+\n", + "| loc.phys.geo | 1 |\n", + "+--------------+---------+\n", + "| pers.ind | 1 |\n", + "+--------------+---------+\n", + "\n" + ] + } + ], + "source": [ + "# print some basic stats for the TSV dataset \n", + "print(describe_dataset(documents=letemps_test_fr_docs))" + ] + }, { "cell_type": "markdown", "id": "ed696674", @@ -1713,13 +2424,14 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 92, "id": "51fa12eb", "metadata": {}, "outputs": [], "source": [ "topRes19th_dev_en_path = os.path.join(HIPE2022_data_path, f\"topres19th/en/HIPE-2022-{RELEASE_VERSION}-topres19th-dev-en.tsv\")\n", - "topRes19th_train_en_path = os.path.join(HIPE2022_data_path, f\"topres19th/en/HIPE-2022-{RELEASE_VERSION}-topres19th-train-en.tsv\")" + "topRes19th_train_en_path = os.path.join(HIPE2022_data_path, f\"topres19th/en/HIPE-2022-{RELEASE_VERSION}-topres19th-train-en.tsv\")\n", + "#topRes19th_test_en_path = os.path.join(HIPE2022_data_path, 
f\"topres19th/en/HIPE-2022-{RELEASE_VERSION}-topres19th-test-en.tsv\")" ] }, { @@ -1732,7 +2444,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 93, "id": "2457dbf9", "metadata": {}, "outputs": [], @@ -1743,7 +2455,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 94, "id": "e1fa7ece", "metadata": {}, "outputs": [ @@ -1785,7 +2497,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 95, "id": "8094673a", "metadata": {}, "outputs": [], @@ -1796,7 +2508,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 96, "id": "fc46b0f1", "metadata": {}, "outputs": [ @@ -1828,6 +2540,38 @@ "print(describe_dataset(documents=topRes19th_train_en_docs))" ] }, + { + "cell_type": "markdown", + "id": "d90cc1a2", + "metadata": {}, + "source": [ + "#### Test\n", + "\n", + "(lines below are to be uncommented once the test files for topRes are published)" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "id": "f46d26bc", + "metadata": {}, + "outputs": [], + "source": [ + "# parse the TSV into a list of `HipeDocument` objects\n", + "#topRes19th_test_en_docs = parse_tsv(file_path=topRes19th_test_en_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "id": "00abdc89", + "metadata": {}, + "outputs": [], + "source": [ + "# print some basic stats for the TSV dataset \n", + "#print(describe_dataset(documents=topRes19th_test_en_docs))" + ] + }, { "cell_type": "markdown", "id": "9abfd336", @@ -1848,7 +2592,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 99, "id": "24c4cf39", "metadata": {}, "outputs": [], @@ -1856,18 +2600,22 @@ "# FR\n", "newseye_dev_fr_path = os.path.join(HIPE2022_data_path, f\"newseye/fr/HIPE-2022-{RELEASE_VERSION}-newseye-dev-fr.tsv\")\n", "newseye_train_fr_path = os.path.join(HIPE2022_data_path, f\"newseye/fr/HIPE-2022-{RELEASE_VERSION}-newseye-train-fr.tsv\")\n", + "newseye_test_fr_path =
os.path.join(HIPE2022_data_path, f\"newseye/fr/HIPE-2022-{RELEASE_VERSION}-newseye-test-fr.tsv\")\n", "\n", "# DE\n", "newseye_dev_de_path = os.path.join(HIPE2022_data_path, f\"newseye/de/HIPE-2022-{RELEASE_VERSION}-newseye-dev-de.tsv\")\n", "newseye_train_de_path = os.path.join(HIPE2022_data_path, f\"newseye/de/HIPE-2022-{RELEASE_VERSION}-newseye-train-de.tsv\")\n", + "newseye_test_de_path = os.path.join(HIPE2022_data_path, f\"newseye/de/HIPE-2022-{RELEASE_VERSION}-newseye-test-de.tsv\")\n", "\n", "# FI\n", "newseye_dev_fi_path = os.path.join(HIPE2022_data_path, f\"newseye/fi/HIPE-2022-{RELEASE_VERSION}-newseye-dev-fi.tsv\")\n", "newseye_train_fi_path = os.path.join(HIPE2022_data_path, f\"newseye/fi/HIPE-2022-{RELEASE_VERSION}-newseye-train-fi.tsv\")\n", + "newseye_test_fi_path = os.path.join(HIPE2022_data_path, f\"newseye/fi/HIPE-2022-{RELEASE_VERSION}-newseye-test-fi.tsv\")\n", "\n", "# SV\n", "newseye_dev_sv_path = os.path.join(HIPE2022_data_path, f\"newseye/sv/HIPE-2022-{RELEASE_VERSION}-newseye-dev-sv.tsv\")\n", - "newseye_train_sv_path = os.path.join(HIPE2022_data_path, f\"newseye/sv/HIPE-2022-{RELEASE_VERSION}-newseye-train-sv.tsv\")" + "newseye_train_sv_path = os.path.join(HIPE2022_data_path, f\"newseye/sv/HIPE-2022-{RELEASE_VERSION}-newseye-train-sv.tsv\")\n", + "newseye_test_sv_path = os.path.join(HIPE2022_data_path, f\"newseye/sv/HIPE-2022-{RELEASE_VERSION}-newseye-test-sv.tsv\")" ] }, { @@ -1888,7 +2636,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 100, "id": "35e7be47", "metadata": {}, "outputs": [], @@ -1899,7 +2647,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 101, "id": "c0093f53", "metadata": {}, "outputs": [ @@ -1922,25 +2670,119 @@ "+-----------+---------+\n", "| ORG | 113 |\n", "+-----------+---------+\n", - "| PER | 293 |\n", + "| PER | 293 |\n", + "+-----------+---------+\n", + "fine_lit\n", + "+------------+---------+\n", + "| | count |\n", + "+============+=========+\n", + "| 
PER.author | 3 |\n", + "+------------+---------+\n", + "nested\n", + "+-----------+---------+\n", + "| | count |\n", + "+===========+=========+\n", + "| HumanProd | 1 |\n", + "+-----------+---------+\n", + "| LOC | 18 |\n", + "+-----------+---------+\n", + "| ORG | 7 |\n", + "+-----------+---------+\n", + "| PER | 6 |\n", + "+-----------+---------+\n", + "\n" + ] + } + ], + "source": [ + "# print some basic stats for the TSV dataset \n", + "print(describe_dataset(documents=newseye_dev_fr_docs))" + ] + }, + { + "cell_type": "markdown", + "id": "f527b0d3", + "metadata": {}, + "source": [ + "#### Train" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "id": "541b4c65", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'./data/v2.1/newseye/fr/HIPE-2022-v2.1-newseye-train-fr.tsv'" + ] + }, + "execution_count": 102, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "newseye_train_fr_path" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "id": "7b4cd97a", + "metadata": {}, + "outputs": [], + "source": [ + "# parse the TSV into a list of `HipeDocument` objects\n", + "newseye_train_fr_docs = parse_tsv(file_path=newseye_train_fr_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "id": "4cdc73ab", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Path of the TSV file: ./data/v2.1/newseye/fr/HIPE-2022-v2.1-newseye-train-fr.tsv \n", + "Number of documents: 35 \n", + "Number of entities: {'coarse_lit': 10423, 'fine_lit': 99, 'nested': 522} \n", + "Number of tokens: 255165 \n", + "Entity breakdown by type: coarse_lit\n", + "+-----------+---------+\n", + "| | count |\n", + "+===========+=========+\n", + "| HumanProd | 200 |\n", + "+-----------+---------+\n", + "| LOC | 4055 |\n", + "+-----------+---------+\n", + "| ORG | 1285 |\n", + "+-----------+---------+\n", + "| PER | 4883 |\n", "+-----------+---------+\n", "fine_lit\n", 
"+------------+---------+\n", "| | count |\n", "+============+=========+\n", - "| PER.author | 3 |\n", + "| PER.author | 99 |\n", "+------------+---------+\n", "nested\n", "+-----------+---------+\n", "| | count |\n", "+===========+=========+\n", - "| HumanProd | 1 |\n", + "| HumanProd | 6 |\n", "+-----------+---------+\n", - "| LOC | 18 |\n", + "| LOC | 263 |\n", "+-----------+---------+\n", - "| ORG | 7 |\n", + "| ORG | 212 |\n", "+-----------+---------+\n", - "| PER | 6 |\n", + "| PER | 41 |\n", "+-----------+---------+\n", "\n" ] @@ -1948,53 +2790,32 @@ ], "source": [ "# print some basic stats for the TSV dataset \n", - "print(describe_dataset(documents=newseye_dev_fr_docs))" + "print(describe_dataset(documents=newseye_train_fr_docs))" ] }, { "cell_type": "markdown", - "id": "f527b0d3", - "metadata": {}, - "source": [ - "#### Train" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "id": "541b4c65", + "id": "8e67dd9d", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'./data/v2.1/newseye/fr/HIPE-2022-v2.1-newseye-train-fr.tsv'" - ] - }, - "execution_count": 41, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "newseye_train_fr_path" + "#### Test" ] }, { "cell_type": "code", - "execution_count": 42, - "id": "7b4cd97a", + "execution_count": 105, + "id": "a41bd5ac", "metadata": {}, "outputs": [], "source": [ "# parse the TSV into a list of `HipeDocument` objects\n", - "newseye_train_fr_docs = parse_tsv(file_path=newseye_train_fr_path)" + "newseye_test_fr_docs = parse_tsv(file_path=newseye_test_fr_path)" ] }, { "cell_type": "code", - "execution_count": 43, - "id": "4cdc73ab", + "execution_count": 106, + "id": "42ab4bd5", "metadata": {}, "outputs": [ { @@ -2002,39 +2823,39 @@ "output_type": "stream", "text": [ "\n", - "Path of the TSV file: ./data/v2.1/newseye/fr/HIPE-2022-v2.1-newseye-train-fr.tsv \n", + "Path of the TSV file: ./data/v2.1/newseye/fr/HIPE-2022-v2.1-newseye-test-fr.tsv \n", "Number of 
documents: 35 \n", - "Number of entities: {'coarse_lit': 10423, 'fine_lit': 99, 'nested': 522} \n", - "Number of tokens: 255165 \n", + "Number of entities: {'coarse_lit': 2530, 'fine_lit': 34, 'nested': 136} \n", + "Number of tokens: 70794 \n", "Entity breakdown by type: coarse_lit\n", "+-----------+---------+\n", "| | count |\n", "+===========+=========+\n", - "| HumanProd | 200 |\n", + "| HumanProd | 33 |\n", "+-----------+---------+\n", - "| LOC | 4055 |\n", + "| LOC | 1112 |\n", "+-----------+---------+\n", - "| ORG | 1285 |\n", + "| ORG | 360 |\n", "+-----------+---------+\n", - "| PER | 4883 |\n", + "| PER | 1025 |\n", "+-----------+---------+\n", "fine_lit\n", "+------------+---------+\n", "| | count |\n", "+============+=========+\n", - "| PER.author | 99 |\n", + "| PER.author | 34 |\n", "+------------+---------+\n", "nested\n", "+-----------+---------+\n", "| | count |\n", "+===========+=========+\n", - "| HumanProd | 6 |\n", + "| HumanProd | 5 |\n", "+-----------+---------+\n", - "| LOC | 263 |\n", + "| LOC | 85 |\n", "+-----------+---------+\n", - "| ORG | 212 |\n", + "| ORG | 31 |\n", "+-----------+---------+\n", - "| PER | 41 |\n", + "| PER | 15 |\n", "+-----------+---------+\n", "\n" ] @@ -2042,7 +2863,7 @@ ], "source": [ "# print some basic stats for the TSV dataset \n", - "print(describe_dataset(documents=newseye_train_fr_docs))" + "print(describe_dataset(documents=newseye_test_fr_docs))" ] }, { @@ -2063,7 +2884,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 107, "id": "9b6ef18f", "metadata": {}, "outputs": [], @@ -2074,7 +2895,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 108, "id": "3da94913", "metadata": {}, "outputs": [ @@ -2134,7 +2955,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 109, "id": "e0c65881", "metadata": {}, "outputs": [], @@ -2145,7 +2966,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 110, "id": "43791e9a", 
"metadata": {}, "outputs": [ @@ -2197,6 +3018,79 @@ "print(describe_dataset(documents=newseye_train_de_docs))" ] }, + { + "cell_type": "markdown", + "id": "b9b6ce3a", + "metadata": {}, + "source": [ + "#### Test" + ] + }, + { + "cell_type": "code", + "execution_count": 111, + "id": "e59d42ca", + "metadata": {}, + "outputs": [], + "source": [ + "# parse the TSV into a list of `HipeDocument` objects\n", + "newseye_test_de_docs = parse_tsv(file_path=newseye_test_de_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 112, + "id": "49cacfd5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Path of the TSV file: ./data/v2.1/newseye/de/HIPE-2022-v2.1-newseye-test-de.tsv \n", + "Number of documents: 13 \n", + "Number of entities: {'coarse_lit': 2401, 'fine_lit': 13, 'nested': 100} \n", + "Number of tokens: 99775 \n", + "Entity breakdown by type: coarse_lit\n", + "+-----------+---------+\n", + "| | count |\n", + "+===========+=========+\n", + "| HumanProd | 15 |\n", + "+-----------+---------+\n", + "| LOC | 1222 |\n", + "+-----------+---------+\n", + "| ORG | 353 |\n", + "+-----------+---------+\n", + "| PER | 811 |\n", + "+-----------+---------+\n", + "fine_lit\n", + "+------------+---------+\n", + "| | count |\n", + "+============+=========+\n", + "| PER.author | 13 |\n", + "+------------+---------+\n", + "nested\n", + "+-----------+---------+\n", + "| | count |\n", + "+===========+=========+\n", + "| HumanProd | 2 |\n", + "+-----------+---------+\n", + "| LOC | 30 |\n", + "+-----------+---------+\n", + "| ORG | 29 |\n", + "+-----------+---------+\n", + "| PER | 39 |\n", + "+-----------+---------+\n", + "\n" + ] + } + ], + "source": [ + "# print some basic stats for the TSV dataset \n", + "print(describe_dataset(documents=newseye_test_de_docs))" + ] + }, { "cell_type": "markdown", "id": "872b6a68", @@ -2215,7 +3109,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 113, "id": 
"a6550817", "metadata": {}, "outputs": [], @@ -2226,7 +3120,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 114, "id": "2ef3872d", "metadata": {}, "outputs": [ @@ -2286,7 +3180,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 115, "id": "d183a069", "metadata": {}, "outputs": [], @@ -2297,7 +3191,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 116, "id": "06e61ccb", "metadata": {}, "outputs": [ @@ -2349,6 +3243,77 @@ "print(describe_dataset(documents=newseye_train_fi_docs))" ] }, + { + "cell_type": "markdown", + "id": "8585309d", + "metadata": {}, + "source": [ + "#### Test" + ] + }, + { + "cell_type": "code", + "execution_count": 117, + "id": "7d4e29a0", + "metadata": {}, + "outputs": [], + "source": [ + "# parse the TSV into a list of `HipeDocument` objects\n", + "newseye_test_fi_docs = parse_tsv(file_path=newseye_test_fi_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 118, + "id": "d37bb572", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Path of the TSV file: ./data/v2.1/newseye/fi/HIPE-2022-v2.1-newseye-test-fi.tsv \n", + "Number of documents: 24 \n", + "Number of entities: {'coarse_lit': 691, 'fine_lit': 7, 'nested': 42} \n", + "Number of tokens: 14963 \n", + "Entity breakdown by type: coarse_lit\n", + "+-----------+---------+\n", + "| | count |\n", + "+===========+=========+\n", + "| HumanProd | 22 |\n", + "+-----------+---------+\n", + "| LOC | 262 |\n", + "+-----------+---------+\n", + "| ORG | 54 |\n", + "+-----------+---------+\n", + "| PER | 353 |\n", + "+-----------+---------+\n", + "fine_lit\n", + "+------------+---------+\n", + "| | count |\n", + "+============+=========+\n", + "| PER.author | 7 |\n", + "+------------+---------+\n", + "nested\n", + "+-----+---------+\n", + "| | count |\n", + "+=====+=========+\n", + "| LOC | 29 |\n", + "+-----+---------+\n", + "| ORG | 10 |\n", + 
"+-----+---------+\n", + "| PER | 3 |\n", + "+-----+---------+\n", + "\n" + ] + } + ], + "source": [ + "# print some basic stats for the TSV dataset \n", + "print(describe_dataset(documents=newseye_test_fi_docs))" + ] + }, { "cell_type": "markdown", "id": "74083291", @@ -2367,7 +3332,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 119, "id": "266f8bee", "metadata": {}, "outputs": [], @@ -2378,7 +3343,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 120, "id": "42c4a49a", "metadata": {}, "outputs": [ @@ -2438,7 +3403,7 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 121, "id": "cc962b4f", "metadata": {}, "outputs": [], @@ -2449,7 +3414,7 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 122, "id": "b6f6f2e4", "metadata": {}, "outputs": [ @@ -2501,6 +3466,71 @@ "print(describe_dataset(documents=newseye_train_sv_docs))" ] }, + { + "cell_type": "markdown", + "id": "89d1ad9a", + "metadata": {}, + "source": [ + "#### Test" + ] + }, + { + "cell_type": "code", + "execution_count": 123, + "id": "40406c49", + "metadata": {}, + "outputs": [], + "source": [ + "# parse the TSV into a list of `HipeDocument` objects\n", + "newseye_test_sv_docs = parse_tsv(file_path=newseye_test_sv_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 124, + "id": "018bf812", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Path of the TSV file: ./data/v2.1/newseye/sv/HIPE-2022-v2.1-newseye-test-sv.tsv \n", + "Number of documents: 21 \n", + "Number of entities: {'coarse_lit': 604, 'nested': 26} \n", + "Number of tokens: 16162 \n", + "Entity breakdown by type: coarse_lit\n", + "+-----------+---------+\n", + "| | count |\n", + "+===========+=========+\n", + "| HumanProd | 22 |\n", + "+-----------+---------+\n", + "| LOC | 313 |\n", + "+-----------+---------+\n", + "| ORG | 60 |\n", + "+-----------+---------+\n", + "| PER | 
209 |\n", + "+-----------+---------+\n", + "nested\n", + "+-----+---------+\n", + "| | count |\n", + "+=====+=========+\n", + "| LOC | 20 |\n", + "+-----+---------+\n", + "| ORG | 2 |\n", + "+-----+---------+\n", + "| PER | 4 |\n", + "+-----+---------+\n", + "\n" + ] + } + ], + "source": [ + "# print some basic stats for the TSV dataset \n", + "print(describe_dataset(documents=newseye_test_sv_docs))" + ] + }, { "cell_type": "markdown", "id": "170e049d", @@ -2521,12 +3551,13 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 125, "id": "263a7947", "metadata": {}, "outputs": [], "source": [ - "sonar_dev_de_path = os.path.join(HIPE2022_data_path, f\"sonar/de/HIPE-2022-{RELEASE_VERSION}-sonar-dev-de.tsv\")" + "sonar_dev_de_path = os.path.join(HIPE2022_data_path, f\"sonar/de/HIPE-2022-{RELEASE_VERSION}-sonar-dev-de.tsv\")\n", + "sonar_test_de_path = os.path.join(HIPE2022_data_path, f\"sonar/de/HIPE-2022-{RELEASE_VERSION}-sonar-test-de.tsv\")" ] }, { @@ -2539,7 +3570,7 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 126, "id": "d5ac5fcb", "metadata": {}, "outputs": [], @@ -2550,7 +3581,7 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 127, "id": "fba9bf96", "metadata": {}, "outputs": [ @@ -2562,7 +3593,7 @@ "Path of the TSV file: ./data/v2.1/sonar/de/HIPE-2022-v2.1-sonar-dev-de.tsv \n", "Number of documents: 10 \n", "Number of entities: {'coarse_lit': 654} \n", - "Number of tokens: 17564 \n", + "Number of tokens: 17476 \n", "Entity breakdown by type: coarse_lit\n", "+-----+---------+\n", "| | count |\n", @@ -2581,6 +3612,59 @@ "# print some basic stats for the TSV dataset \n", "print(describe_dataset(documents=sonar_dev_de_docs))" ] + }, + { + "cell_type": "markdown", + "id": "c5c90f0d", + "metadata": {}, + "source": [ + "#### Test" + ] + }, + { + "cell_type": "code", + "execution_count": 128, + "id": "b9081feb", + "metadata": {}, + "outputs": [], + "source": [ + "# parse the TSV into a list of 
`HipeDocument` objects\n", + "sonar_test_de_docs = parse_tsv(file_path=sonar_test_de_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 129, + "id": "7b84855f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Path of the TSV file: ./data/v2.1/sonar/de/HIPE-2022-v2.1-sonar-test-de.tsv \n", + "Number of documents: 10 \n", + "Number of entities: {'coarse_lit': 471} \n", + "Number of tokens: 15463 \n", + "Entity breakdown by type: coarse_lit\n", + "+-----+---------+\n", + "| | count |\n", + "+=====+=========+\n", + "| LOC | 177 |\n", + "+-----+---------+\n", + "| ORG | 111 |\n", + "+-----+---------+\n", + "| PER | 183 |\n", + "+-----+---------+\n", + "\n" + ] + } + ], + "source": [ + "# print some basic stats for the TSV dataset \n", + "print(describe_dataset(documents=sonar_test_de_docs))" + ] } ], "metadata": {