% dataset_citations.bib
%%%%%%%%%%%%%%%%%%%%%%%%%%%%% In anthology.bib %%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Add
@inproceedings{el-haj-etal-2018-arabic,
  author    = {El-Haj, Mahmoud and Rayson, Paul and Aboelezz, Mariam},
  title     = {{A}rabic Dialect Identification in the Context of Bivalency and Code-Switching},
  booktitle = {Proceedings of the Eleventh International Conference on Language Resources and Evaluation ({LREC} 2018)},
  publisher = {European Language Resources Association (ELRA)},
  address   = {Miyazaki, Japan},
  month     = may,
  year      = {2018},
  url       = {https://aclanthology.org/L18-1573},
}
% AfroMAFT
@inproceedings{adelani-etal-2022-thousand,
  author    = {Adelani, David and Alabi, Jesujoba and Fan, Angela and Kreutzer, Julia and
               Shen, Xiaoyu and Reid, Machel and Ruiter, Dana and Klakow, Dietrich and
               Nabende, Peter and Chang, Ernie and Gwadabe, Tajuddeen and Sackey, Freshia and
               Dossou, Bonaventure F. P. and Emezue, Chris and Leong, Colin and
               Beukman, Michael and Muhammad, Shamsuddeen and Jarso, Guyo and Yousuf, Oreen and
               Niyongabo Rubungo, Andre and Hacheme, Gilles and Wairagala, Eric Peter and
               Nasir, Muhammad Umair and Ajibade, Benjamin and Ajayi, Tunde and Gitau, Yvonne and
               Abbott, Jade and Ahmed, Mohamed and Ochieng, Millicent and Aremu, Anuoluwapo and
               Ogayo, Perez and Mukiibi, Jonathan and Ouoba Kabore, Fatoumata and
               Kalipe, Godson and Mbaye, Derguene and Tapo, Allahsera Auguste and
               Memdjokam Koagne, Victoire and Munkoh-Buabeng, Edwin and Wagner, Valencia and
               Abdulmumin, Idris and Awokoya, Ayodele and Buzaaba, Happy and
               Sibanda, Blessing and Bukula, Andiswa and Manthalu, Sam},
  title     = {A Few Thousand Translations Go a Long Way! Leveraging Pre-trained Models for {A}frican News Translation},
  booktitle = {Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
  publisher = {Association for Computational Linguistics},
  address   = {Seattle, United States},
  month     = jul,
  year      = {2022},
  pages     = {3053--3070},
  doi       = {10.18653/v1/2022.naacl-main.223},
  url       = {https://aclanthology.org/2022.naacl-main.223},
}
@inproceedings{xue-etal-2021-mt5,
  author    = {Xue, Linting and Constant, Noah and Roberts, Adam and Kale, Mihir and
               Al-Rfou, Rami and Siddhant, Aditya and Barua, Aditya and Raffel, Colin},
  title     = {m{T}5: A Massively Multilingual Pre-trained Text-to-Text Transformer},
  booktitle = {Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
  publisher = {Association for Computational Linguistics},
  address   = {Online},
  month     = jun,
  year      = {2021},
  pages     = {483--498},
  doi       = {10.18653/v1/2021.naacl-main.41},
  url       = {https://aclanthology.org/2021.naacl-main.41},
}
% AraBench
@inproceedings{sajjad-etal-2020-arabench,
  author    = {Sajjad, Hassan and Abdelali, Ahmed and Durrani, Nadir and Dalvi, Fahim},
  title     = {{A}ra{B}ench: Benchmarking Dialectal {A}rabic-{E}nglish Machine Translation},
  booktitle = {Proceedings of the 28th International Conference on Computational Linguistics},
  publisher = {International Committee on Computational Linguistics},
  address   = {Barcelona, Spain (Online)},
  month     = dec,
  year      = {2020},
  pages     = {5094--5107},
  doi       = {10.18653/v1/2020.coling-main.447},
  url       = {https://aclanthology.org/2020.coling-main.447},
}
% CC100
@inproceedings{conneau-etal-2020-unsupervised,
  author    = {Conneau, Alexis and Khandelwal, Kartikay and Goyal, Naman and
               Chaudhary, Vishrav and Wenzek, Guillaume and Guzm{\'a}n, Francisco and
               Grave, Edouard and Ott, Myle and Zettlemoyer, Luke and Stoyanov, Veselin},
  title     = {Unsupervised Cross-lingual Representation Learning at Scale},
  booktitle = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics},
  publisher = {Association for Computational Linguistics},
  address   = {Online},
  month     = jul,
  year      = {2020},
  pages     = {8440--8451},
  doi       = {10.18653/v1/2020.acl-main.747},
  url       = {https://aclanthology.org/2020.acl-main.747},
}
% CCNet
@inproceedings{wenzek-etal-2020-ccnet,
  author    = {Wenzek, Guillaume and Lachaux, Marie-Anne and Conneau, Alexis and
               Chaudhary, Vishrav and Guzm{\'a}n, Francisco and Joulin, Armand and
               Grave, Edouard},
  title     = {{CCN}et: Extracting High Quality Monolingual Datasets from Web Crawl Data},
  booktitle = {Proceedings of the Twelfth Language Resources and Evaluation Conference},
  publisher = {European Language Resources Association},
  address   = {Marseille, France},
  month     = may,
  year      = {2020},
  pages     = {4003--4012},
  language  = {English},
  isbn      = {979-10-95546-34-4},
  url       = {https://aclanthology.org/2020.lrec-1.494},
}
% DART
@inproceedings{alsarsour-etal-2018-dart,
  author    = {Alsarsour, Israa and Mohamed, Esraa and Suwaileh, Reem and Elsayed, Tamer},
  title     = {{DART}: A Large Dataset of Dialectal {A}rabic Tweets},
  booktitle = {Proceedings of the Eleventh International Conference on Language Resources and Evaluation ({LREC} 2018)},
  publisher = {European Language Resources Association (ELRA)},
  address   = {Miyazaki, Japan},
  month     = may,
  year      = {2018},
  url       = {https://aclanthology.org/L18-1579},
}
% GiossaMedia
@inproceedings{gongora-etal-2022-use,
  author    = {G{\'o}ngora, Santiago and Giossa, Nicol{\'a}s and Chiruzzo, Luis},
  title     = {Can We Use Word Embeddings for Enhancing {G}uarani-{S}panish Machine Translation?},
  booktitle = {Proceedings of the Fifth Workshop on the Use of Computational Methods in the Study of Endangered Languages},
  publisher = {Association for Computational Linguistics},
  address   = {Dublin, Ireland},
  month     = may,
  year      = {2022},
  pages     = {127--132},
  doi       = {10.18653/v1/2022.computel-1.16},
  url       = {https://aclanthology.org/2022.computel-1.16},
}
@inproceedings{gongora-etal-2021-experiments,
  author    = {G{\'o}ngora, Santiago and Giossa, Nicol{\'a}s and Chiruzzo, Luis},
  title     = {Experiments on a {G}uarani Corpus of News and Social Media},
  booktitle = {Proceedings of the First Workshop on Natural Language Processing for Indigenous Languages of the Americas},
  publisher = {Association for Computational Linguistics},
  address   = {Online},
  month     = jun,
  year      = {2021},
  pages     = {153--158},
  doi       = {10.18653/v1/2021.americasnlp-1.16},
  url       = {https://aclanthology.org/2021.americasnlp-1.16},
}
% Glosses
@inproceedings{camacho-collados-etal-2016-large,
  author    = {Camacho-Collados, Jos{\'e} and Delli Bovi, Claudio and
               Raganato, Alessandro and Navigli, Roberto},
  title     = {A Large-Scale Multilingual Disambiguation of Glosses},
  booktitle = {Proceedings of the Tenth International Conference on Language Resources and Evaluation ({LREC}'16)},
  publisher = {European Language Resources Association (ELRA)},
  address   = {Portoro{\v{z}}, Slovenia},
  month     = may,
  year      = {2016},
  pages     = {1701--1708},
  url       = {https://aclanthology.org/L16-1269},
}
% Habibi
@inproceedings{el-haj-2020-habibi,
  author    = {El-Haj, Mahmoud},
  title     = {Habibi - a multi Dialect multi National {A}rabic Song Lyrics Corpus},
  booktitle = {Proceedings of the Twelfth Language Resources and Evaluation Conference},
  publisher = {European Language Resources Association},
  address   = {Marseille, France},
  month     = may,
  year      = {2020},
  pages     = {1318--1326},
  language  = {English},
  isbn      = {979-10-95546-34-4},
  url       = {https://aclanthology.org/2020.lrec-1.165},
}
% IITB
@inproceedings{kunchukuttan-etal-2018-iit,
  author    = {Kunchukuttan, Anoop and Mehta, Pratik and Bhattacharyya, Pushpak},
  title     = {The {IIT} {B}ombay {E}nglish-{H}indi Parallel Corpus},
  booktitle = {Proceedings of the Eleventh International Conference on Language Resources and Evaluation ({LREC} 2018)},
  publisher = {European Language Resources Association (ELRA)},
  address   = {Miyazaki, Japan},
  month     = may,
  year      = {2018},
  url       = {https://aclanthology.org/L18-1548},
}
% IndicNLP
@inproceedings{nakazawa-etal-2021-overview,
  author    = {Nakazawa, Toshiaki and Nakayama, Hideki and Ding, Chenchen and Dabre, Raj and
               Higashiyama, Shohei and Mino, Hideya and Goto, Isao and Pa Pa, Win and
               Kunchukuttan, Anoop and Parida, Shantipriya and Bojar, Ond{\v{r}}ej and
               Chu, Chenhui and Eriguchi, Akiko and Abe, Kaori and Oda, Yusuke and
               Kurohashi, Sadao},
  title     = {Overview of the 8th Workshop on {A}sian Translation},
  booktitle = {Proceedings of the 8th Workshop on Asian Translation (WAT2021)},
  publisher = {Association for Computational Linguistics},
  address   = {Online},
  month     = aug,
  year      = {2021},
  pages     = {1--45},
  doi       = {10.18653/v1/2021.wat-1.1},
  url       = {https://aclanthology.org/2021.wat-1.1},
}
% Indiccorp
@inproceedings{kakwani-etal-2020-indicnlpsuite,
  author    = {Kakwani, Divyanshu and Kunchukuttan, Anoop and Golla, Satish and
               N.C., Gokul and Bhattacharyya, Avik and Khapra, Mitesh M. and
               Kumar, Pratyush},
  title     = {{I}ndic{NLPS}uite: Monolingual Corpora, Evaluation Benchmarks and Pre-trained Multilingual Language Models for {I}ndian Languages},
  booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2020},
  publisher = {Association for Computational Linguistics},
  address   = {Online},
  month     = nov,
  year      = {2020},
  pages     = {4948--4961},
  doi       = {10.18653/v1/2020.findings-emnlp.445},
  url       = {https://aclanthology.org/2020.findings-emnlp.445},
}
% JoshuaDec
@inproceedings{post-etal-2012-constructing,
  author    = {Post, Matt and Callison-Burch, Chris and Osborne, Miles},
  title     = {Constructing Parallel Corpora for Six {I}ndian Languages via Crowdsourcing},
  booktitle = {Proceedings of the Seventh Workshop on Statistical Machine Translation},
  publisher = {Association for Computational Linguistics},
  address   = {Montr{\'e}al, Canada},
  month     = jun,
  year      = {2012},
  pages     = {401--409},
  url       = {https://aclanthology.org/W12-3152},
}
% JESC
@inproceedings{pryzant-etal-2018-jesc,
  author    = {Pryzant, Reid and Chung, Youngjoo and Jurafsky, Dan and Britz, Denny},
  title     = {{JESC}: {J}apanese-{E}nglish Subtitle Corpus},
  booktitle = {Proceedings of the Eleventh International Conference on Language Resources and Evaluation ({LREC} 2018)},
  publisher = {European Language Resources Association (ELRA)},
  address   = {Miyazaki, Japan},
  month     = may,
  year      = {2018},
  url       = {https://aclanthology.org/L18-1182},
}
% JParaCrawl
@inproceedings{morishita-etal-2020-jparacrawl,
  author    = {Morishita, Makoto and Suzuki, Jun and Nagata, Masaaki},
  title     = {{JP}ara{C}rawl: A Large Scale Web-Based {E}nglish-{J}apanese Parallel Corpus},
  booktitle = {Proceedings of the Twelfth Language Resources and Evaluation Conference},
  publisher = {European Language Resources Association},
  address   = {Marseille, France},
  month     = may,
  year      = {2020},
  pages     = {3603--3609},
  language  = {English},
  isbn      = {979-10-95546-34-4},
  url       = {https://aclanthology.org/2020.lrec-1.443},
}
% MTData
@inproceedings{gowda-etal-2021-many,
  author    = {Gowda, Thamme and Zhang, Zhao and Mattmann, Chris and May, Jonathan},
  title     = {Many-to-{E}nglish Machine Translation Tools, Data, and Pretrained Models},
  booktitle = {Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing: System Demonstrations},
  publisher = {Association for Computational Linguistics},
  address   = {Online},
  month     = aug,
  year      = {2021},
  pages     = {306--316},
  doi       = {10.18653/v1/2021.acl-demo.37},
  url       = {https://aclanthology.org/2021.acl-demo.37},
}
% Menyo20K
@inproceedings{adelani-etal-2021-effect,
  author    = {Adelani, David and Ruiter, Dana and Alabi, Jesujoba and
               Adebonojo, Damilola and Ayeni, Adesina and Adeyemi, Mofe and
               Awokoya, Ayodele Esther and Espa{\~n}a-Bonet, Cristina},
  title     = {The Effect of Domain and Diacritics in {Y}oruba{--}{E}nglish Neural Machine Translation},
  booktitle = {Proceedings of Machine Translation Summit XVIII: Research Track},
  publisher = {Association for Machine Translation in the Americas},
  address   = {Virtual},
  month     = aug,
  year      = {2021},
  pages     = {61--75},
  url       = {https://aclanthology.org/2021.mtsummit-research.6},
}
% Minangkabau corpora
@inproceedings{koto-koto-2020-towards,
  author    = {Koto, Fajri and Koto, Ikhwan},
  title     = {Towards Computational Linguistics in {M}inangkabau Language: Studies on Sentiment Analysis and Machine Translation},
  booktitle = {Proceedings of the 34th Pacific Asia Conference on Language, Information and Computation},
  publisher = {Association for Computational Linguistics},
  address   = {Hanoi, Vietnam},
  month     = oct,
  year      = {2020},
  pages     = {138--148},
  url       = {https://aclanthology.org/2020.paclic-1.17},
}
% MoT
@inproceedings{palen-michel-etal-2022-multilingual,
  author    = {Palen-Michel, Chester and Kim, June and Lignos, Constantine},
  title     = {Multilingual Open Text Release 1: Public Domain News in 44 Languages},
  booktitle = {Proceedings of the Thirteenth Language Resources and Evaluation Conference},
  publisher = {European Language Resources Association},
  address   = {Marseille, France},
  month     = jun,
  year      = {2022},
  pages     = {2080--2089},
  url       = {https://aclanthology.org/2022.lrec-1.224},
}
% ParaCrawl
@inproceedings{banon-etal-2020-paracrawl,
  author    = {Ba{\~n}{\'o}n, Marta and Chen, Pinzhen and Haddow, Barry and
               Heafield, Kenneth and Hoang, Hieu and Espl{\`a}-Gomis, Miquel and
               Forcada, Mikel L. and Kamran, Amir and Kirefu, Faheem and Koehn, Philipp and
               Ortiz Rojas, Sergio and Pla Sempere, Leopoldo and
               Ram{\'\i}rez-S{\'a}nchez, Gema and Sarr{\'\i}as, Elsa and Strelec, Marek and
               Thompson, Brian and Waites, William and Wiggins, Dion and Zaragoza, Jaume},
  title     = {{P}ara{C}rawl: Web-Scale Acquisition of Parallel Corpora},
  booktitle = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics},
  publisher = {Association for Computational Linguistics},
  address   = {Online},
  month     = jul,
  year      = {2020},
  pages     = {4555--4567},
  doi       = {10.18653/v1/2020.acl-main.417},
  url       = {https://aclanthology.org/2020.acl-main.417},
}
% PBC
@inproceedings{mayer-cysouw-2014-creating,
  author    = {Mayer, Thomas and Cysouw, Michael},
  title     = {Creating a massively parallel {B}ible corpus},
  booktitle = {Proceedings of the Ninth International Conference on Language Resources and Evaluation ({LREC}'14)},
  publisher = {European Language Resources Association (ELRA)},
  address   = {Reykjavik, Iceland},
  month     = may,
  year      = {2014},
  pages     = {3158--3163},
  url       = {http://www.lrec-conf.org/proceedings/lrec2014/pdf/220_Paper.pdf},
  abstract  = {We present our ongoing effort to create a massively parallel Bible corpus. While an ever-increasing number of Bible translations is available in electronic form on the internet, there is no large-scale parallel Bible corpus that allows language researchers to easily get access to the texts and their parallel structure for a large variety of different languages. We report on the current status of the corpus, with over 900 translations in more than 830 language varieties. All translations are tokenized (e.g., separating punctuation marks) and Unicode normalized. Mainly due to copyright restrictions only portions of the texts are made publicly available. However, we provide co-occurrence information for each translation in a (sparse) matrix format. All word forms in the translation are given together with their frequency and the verses in which they occur.},
}
% Parallel Corpora for Ethiopian Languages
@inproceedings{abate-etal-2018-parallel,
  author    = {Abate, Solomon Teferra and Melese, Michael and Tachbelie, Martha Yifiru and
               Meshesha, Million and Atinafu, Solomon and Mulugeta, Wondwossen and
               Assabie, Yaregal and Abera, Hafte and Ephrem, Binyam and Abebe, Tewodros and
               Tsegaye, Wondimagegnhue and Lemma, Amanuel and Andargie, Tsegaye and
               Shifaw, Seifedin},
  title     = {Parallel Corpora for bi-lingual {E}nglish-{E}thiopian Languages Statistical Machine Translation},
  booktitle = {Proceedings of the 27th International Conference on Computational Linguistics},
  publisher = {Association for Computational Linguistics},
  address   = {Santa Fe, New Mexico, USA},
  month     = aug,
  year      = {2018},
  pages     = {3102--3111},
  url       = {https://aclanthology.org/C18-1262},
}
% QADI
@inproceedings{abdelali-etal-2021-qadi,
  author    = {Abdelali, Ahmed and Mubarak, Hamdy and Samih, Younes and
               Hassan, Sabit and Darwish, Kareem},
  title     = {{QADI}: {A}rabic Dialect Identification in the Wild},
  booktitle = {Proceedings of the Sixth Arabic Natural Language Processing Workshop},
  publisher = {Association for Computational Linguistics},
  address   = {Kyiv, Ukraine (Virtual)},
  month     = apr,
  year      = {2021},
  pages     = {1--10},
  url       = {https://aclanthology.org/2021.wanlp-1.1},
}
% Quechua-IIC
@inproceedings{zevallos-etal-2022-introducing,
  author    = {Zevallos, Rodolfo and Ortega, John and Chen, William and Castro, Richard and
               Bel, N{\'u}ria and Toshio, Cesar and Venturas, Renzo and Aradiel, Hilario and
               Melgarejo, Nelsi},
  title     = {Introducing {Q}u{BERT}: A Large Monolingual Corpus and {BERT} Model for {S}outhern {Q}uechua},
  booktitle = {Proceedings of the Third Workshop on Deep Learning for Low-Resource Natural Language Processing},
  publisher = {Association for Computational Linguistics},
  address   = {Hybrid},
  month     = jul,
  year      = {2022},
  pages     = {1--13},
  doi       = {10.18653/v1/2022.deeplo-1.1},
  url       = {https://aclanthology.org/2022.deeplo-1.1},
}
% Shami
@inproceedings{abu-kwaik-etal-2018-shami,
  author    = {Abu Kwaik, Kathrein and Saad, Motaz and
               Chatzikyriakidis, Stergios and Dobnik, Simon},
  title     = {{S}hami: A Corpus of {L}evantine {A}rabic Dialects},
  booktitle = {Proceedings of the Eleventh International Conference on Language Resources and Evaluation ({LREC} 2018)},
  publisher = {European Language Resources Association (ELRA)},
  address   = {Miyazaki, Japan},
  month     = may,
  year      = {2018},
  url       = {https://aclanthology.org/L18-1576},
}
% SLI\_GalWeb.1.0
@inproceedings{agerri-etal-2018-developing,
  author    = {Agerri, Rodrigo and G{\'o}mez Guinovart, Xavier and
               Rigau, German and Solla Portela, Miguel Anxo},
  title     = {Developing New Linguistic Resources and Tools for the {G}alician Language},
  booktitle = {Proceedings of the Eleventh International Conference on Language Resources and Evaluation ({LREC} 2018)},
  publisher = {European Language Resources Association (ELRA)},
  address   = {Miyazaki, Japan},
  month     = may,
  year      = {2018},
  url       = {https://aclanthology.org/L18-1367},
}
% TeDDi
@inproceedings{moran-etal-2022-teddi,
  author    = {Moran, Steven and Bentz, Christian and Gutierrez-Vasques, Ximena and
               Pelloni, Olga and Samardzic, Tanja},
  title     = {{T}e{DD}i Sample: Text Data Diversity Sample for Language Comparison and Multilingual {NLP}},
  booktitle = {Proceedings of the Thirteenth Language Resources and Evaluation Conference},
  publisher = {European Language Resources Association},
  address   = {Marseille, France},
  month     = jun,
  year      = {2022},
  pages     = {1150--1158},
  url       = {https://aclanthology.org/2022.lrec-1.123},
}
% TICO
@inproceedings{anastasopoulos-etal-2020-tico,
  author    = {Anastasopoulos, Antonios and Cattelan, Alessandro and Dou, Zi-Yi and
               Federico, Marcello and Federmann, Christian and Genzel, Dmitriy and
               Guzm{\'a}n, Francisco and Hu, Junjie and Hughes, Macduff and
               Koehn, Philipp and Lazar, Rosie and Lewis, Will and Neubig, Graham and
               Niu, Mengmeng and {\"O}ktem, Alp and Paquin, Eric and Tang, Grace and
               Tur, Sylwia},
  title     = {{TICO}-19: the Translation Initiative for {CO}vid-19},
  booktitle = {Proceedings of the 1st Workshop on {NLP} for {COVID}-19 (Part 2) at {EMNLP} 2020},
  publisher = {Association for Computational Linguistics},
  address   = {Online},
  month     = dec,
  year      = {2020},
  doi       = {10.18653/v1/2020.nlpcovid19-2.5},
  url       = {https://aclanthology.org/2020.nlpcovid19-2.5},
}
% TIL
@inproceedings{mirzakhalov-etal-2021-large,
  author    = {Mirzakhalov, Jamshidbek and Babu, Anoop and Ataman, Duygu and
               Kariev, Sherzod and Tyers, Francis and Abduraufov, Otabek and
               Hajili, Mammad and Ivanova, Sardana and Khaytbaev, Abror and
               Laverghetta, Jr., Antonio and Moydinboyev, Bekhzodbek and Onal, Esra and
               Pulatova, Shaxnoza and Wahab, Ahsan and Firat, Orhan and
               Chellappan, Sriram},
  title     = {A Large-Scale Study of Machine Translation in {T}urkic Languages},
  booktitle = {Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing},
  publisher = {Association for Computational Linguistics},
  address   = {Online and Punta Cana, Dominican Republic},
  month     = nov,
  year      = {2021},
  pages     = {5876--5890},
  doi       = {10.18653/v1/2021.emnlp-main.475},
  url       = {https://aclanthology.org/2021.emnlp-main.475},
}
% Tilde
@inproceedings{rozis-skadins-2017-tilde,
  author    = {Rozis, Roberts and Skadi{\c{n}}{\v{s}}, Raivis},
  title     = {Tilde {MODEL} - Multilingual Open Data for {EU} Languages},
  booktitle = {Proceedings of the 21st Nordic Conference on Computational Linguistics},
  publisher = {Association for Computational Linguistics},
  address   = {Gothenburg, Sweden},
  month     = may,
  year      = {2017},
  pages     = {263--265},
  url       = {https://aclanthology.org/W17-0235},
}
% WAT 2020
@inproceedings{nakazawa-etal-2022-overview,
  author    = {Nakazawa, Toshiaki and Mino, Hideya and Goto, Isao and Dabre, Raj and
               Higashiyama, Shohei and Parida, Shantipriya and Kunchukuttan, Anoop and
               Morishita, Makoto and Bojar, Ond{\v{r}}ej and Chu, Chenhui and
               Eriguchi, Akiko and Abe, Kaori and Oda, Yusuke and Kurohashi, Sadao},
  title     = {Overview of the 9th Workshop on {A}sian Translation},
  booktitle = {Proceedings of the 9th Workshop on Asian Translation},
  publisher = {International Conference on Computational Linguistics},
  address   = {Gyeongju, Republic of Korea},
  month     = oct,
  year      = {2022},
  pages     = {1--36},
  url       = {https://aclanthology.org/2022.wat-1.1},
}
% WikiMatrix
@inproceedings{schwenk-etal-2021-wikimatrix,
  author    = {Schwenk, Holger and Chaudhary, Vishrav and Sun, Shuo and
               Gong, Hongyu and Guzm{\'a}n, Francisco},
  title     = {{W}iki{M}atrix: Mining 135{M} Parallel Sentences in 1620 Language Pairs from {W}ikipedia},
  booktitle = {Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume},
  publisher = {Association for Computational Linguistics},
  address   = {Online},
  month     = apr,
  year      = {2021},
  pages     = {1351--1361},
  doi       = {10.18653/v1/2021.eacl-main.115},
  url       = {https://aclanthology.org/2021.eacl-main.115},
}
% Workshop on NER for South and South East Asian Languages
@inproceedings{singh-2008-named,
  author    = {Singh, Anil Kumar},
  title     = {Named Entity Recognition for South and South {E}ast {A}sian Languages: Taking Stock},
  booktitle = {Proceedings of the {IJCNLP}-08 Workshop on Named Entity Recognition for South and South East {A}sian Languages},
  year      = {2008},
  url       = {https://aclanthology.org/I08-5003},
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Not in anthology.bib %%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% AfriBERTa
@inproceedings{ogueji2021small,
  author    = {Ogueji, Kelechi and Zhu, Yuxin and Lin, Jimmy},
  title     = {Small Data? {No} Problem! {Exploring} the Viability of Pretrained Multilingual Language Models for Low-resourced Languages},
  booktitle = {Proceedings of the 1st Workshop on Multilingual Representation Learning},
  pages     = {116--126},
  year      = {2021},
}
% Bianet
@inproceedings{ataman2018bianet,
  author    = {Ataman, Duygu},
  title     = {{Bianet}: A Parallel News Corpus in {Turkish}, {Kurdish} and {English}},
  booktitle = {LREC 2018 Workshop},
  pages     = {14},
  year      = {2018},
}
% BLOOM
@inproceedings{DBLP:conf/emnlp/LeongNMFOW22,
  author    = {Colin Leong and
               Joshua Nemecek and
               Jacob Mansdorfer and
               Anna Filighera and
               Abraham Owodunni and
               Daniel Whitenack},
  editor    = {Yoav Goldberg and
               Zornitsa Kozareva and
               Yue Zhang},
  title     = {{Bloom Library}: Multimodal Datasets in 300+ Languages for a Variety
               of Downstream Tasks},
  booktitle = {Proceedings of the 2022 Conference on Empirical Methods in Natural
               Language Processing, {EMNLP} 2022, Abu Dhabi, United Arab Emirates,
               December 7-11, 2022},
  pages     = {8608--8621},
  publisher = {Association for Computational Linguistics},
  year      = {2022},
  url       = {https://aclanthology.org/2022.emnlp-main.590},
  timestamp = {Tue, 07 Feb 2023 17:10:51 +0100},
  biburl    = {https://dblp.org/rec/conf/emnlp/LeongNMFOW22.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
% CC100
% NOTE(review): same paper as wenzek-etal-2020-ccnet earlier in this file; both keys are
% kept so existing citations keep resolving -- consider consolidating on one key.
@inproceedings{DBLP:conf/lrec/WenzekLCCGJG20,
  author    = {Guillaume Wenzek and
               Marie{-}Anne Lachaux and
               Alexis Conneau and
               Vishrav Chaudhary and
               Francisco Guzm{\'{a}}n and
               Armand Joulin and
               Edouard Grave},
  editor    = {Nicoletta Calzolari and
               Fr{\'{e}}d{\'{e}}ric B{\'{e}}chet and
               Philippe Blache and
               Khalid Choukri and
               Christopher Cieri and
               Thierry Declerck and
               Sara Goggi and
               Hitoshi Isahara and
               Bente Maegaard and
               Joseph Mariani and
               H{\'{e}}l{\`{e}}ne Mazo and
               Asunci{\'{o}}n Moreno and
               Jan Odijk and
               Stelios Piperidis},
  title     = {{CCNet}: Extracting High Quality Monolingual Datasets from Web Crawl
               Data},
  booktitle = {Proceedings of The 12th Language Resources and Evaluation Conference,
               {LREC} 2020, Marseille, France, May 11-16, 2020},
  pages     = {4003--4012},
  publisher = {European Language Resources Association},
  year      = {2020},
  url       = {https://aclanthology.org/2020.lrec-1.494/},
  timestamp = {Fri, 06 Aug 2021 00:40:04 +0200},
  biburl    = {https://dblp.org/rec/conf/lrec/WenzekLCCGJG20.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
% Earthlings
@article{DBLP:journals/lre/Dunn20,
  author    = {Jonathan Dunn},
  title     = {Mapping languages: the {Corpus of Global Language Use}},
  journal   = {Language Resources and Evaluation},
  volume    = {54},
  number    = {4},
  pages     = {999--1018},
  year      = {2020},
  url       = {https://doi.org/10.1007/s10579-020-09489-2},
  doi       = {10.1007/s10579-020-09489-2},
  timestamp = {Sun, 02 Oct 2022 15:43:16 +0200},
  biburl    = {https://dblp.org/rec/journals/lre/Dunn20.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
% Flores200, NLLB_seed
@misc{costa2022no,
  author        = {Costa-juss{\`a}, Marta R. and Cross, James and {\c{C}}elebi, Onur and Elbayad, Maha and Heafield, Kenneth and Heffernan, Kevin and Kalbassi, Elahe and Lam, Janice and Licht, Daniel and Maillard, Jean and others},
  title         = {No Language Left Behind: Scaling Human-Centered Machine Translation},
  year          = {2022},
  eprint        = {2207.04672},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CL},
  url           = {https://arxiv.org/abs/2207.04672},
}
% HinDialect
% NOTE(review): retyped from @misc so the institution is printed (classic styles ignore
% `publisher` on @misc); assumes this is a master's thesis -- confirm the degree type.
@mastersthesis{bafna2022empirical,
  author = {Bafna, Niyati},
  title  = {Empirical Models for an {Indic} Language Continuum},
  school = {Univerzita Karlova, Matematicko-fyzik{\'a}ln{\'\i} fakulta},
  year   = {2022},
}
% LeipzigData
@inproceedings{DBLP:conf/lrec/GoldhahnEQ12,
  author    = {Dirk Goldhahn and
               Thomas Eckart and
               Uwe Quasthoff},
  editor    = {Nicoletta Calzolari and
               Khalid Choukri and
               Thierry Declerck and
               Mehmet Ugur Dogan and
               Bente Maegaard and
               Joseph Mariani and
               Jan Odijk and
               Stelios Piperidis},
  title     = {Building Large Monolingual Dictionaries at the {Leipzig Corpora Collection}:
               From 100 to 200 Languages},
  booktitle = {Proceedings of the Eighth International Conference on Language Resources
               and Evaluation, {LREC} 2012, Istanbul, Turkey, May 23-25, 2012},
  pages     = {759--765},
  publisher = {European Language Resources Association {(ELRA)}},
  year      = {2012},
  url       = {http://www.lrec-conf.org/proceedings/lrec2012/summaries/327.html},
  timestamp = {Mon, 19 Aug 2019 15:22:46 +0200},
  biburl    = {https://dblp.org/rec/conf/lrec/GoldhahnEQ12.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
% MaCoCu
@inproceedings{DBLP:conf/eamt/BanonEFGKLNSRRS22,
  author    = {Marta Ba{\~{n}}{\'{o}}n and
               Miquel Espl{\`{a}}{-}Gomis and
               Mikel L. Forcada and
               Cristian Garc{\'{\i}}a{-}Romero and
               Taja Kuzman and
               Nikola Ljube{\v{s}}i{\'{c}} and
               Rik van Noord and
               Leopoldo Pla Sempere and
               Gema Ram{\'{\i}}rez{-}S{\'{a}}nchez and
               Peter Rupnik and
               V{\'{\i}}t Suchomel and
               Antonio Toral and
               Tobias van der Werff and
               Jaume Zaragoza},
  editor    = {Helena Moniz and
               Lieve Macken and
               Andrew Rufener and
               Lo{\"{\i}}c Barrault and
               Marta R. Costa{-}juss{\`{a}} and
               Christophe Declercq and
               Maarit Koponen and
               Ellie Kemp and
               Spyridon Pilos and
               Mikel L. Forcada and
               Carolina Scarton and
               Joachim Van den Bogaert and
               Joke Daems and
               Arda Tezcan and
               Bram Vanroy and
               Margot Fonteyne},
  title     = {{MaCoCu}: Massive collection and curation of monolingual and bilingual
               data: focus on under-resourced languages},
  booktitle = {Proceedings of the 23rd Annual Conference of the European Association
               for Machine Translation, {EAMT} 2022, Ghent, Belgium, June 1-3, 2022},
  pages     = {301--302},
  publisher = {European Association for Machine Translation},
  year      = {2022},
  url       = {https://aclanthology.org/2022.eamt-1.41},
  timestamp = {Thu, 02 Jun 2022 08:16:43 +0200},
  biburl    = {https://dblp.org/rec/conf/eamt/BanonEFGKLNSRRS22.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
% MC4 -- journal article by Raffel et al. (volume 21, 2020); record exported from DBLP.
% Names are given in the unambiguous "Last, First" form.
@article{DBLP:journals/jmlr/RaffelSRLNMZLL20,
  author    = {Raffel, Colin and Shazeer, Noam and Roberts, Adam and
               Lee, Katherine and Narang, Sharan and Matena, Michael and
               Zhou, Yanqi and Li, Wei and Liu, Peter J.},
  title     = {Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer},
  journal   = {J. Mach. Learn. Res.},
  volume    = {21},
  pages     = {140:1--140:67},
  year      = {2020},
  url       = {http://jmlr.org/papers/v21/20-074.html},
  biburl    = {https://dblp.org/rec/journals/jmlr/RaffelSRLNMZLL20.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org},
  timestamp = {Fri, 05 Feb 2021 15:43:41 +0100}
}
% OPUS -- Tiedemann's paper in the LREC 2012 proceedings.
% The editor list holds personal names only: the role annotation
% "(Conference Chair)" was removed because BibTeX parsed it as part of the name.
% The conference days (23-25 May 2012) are stored as an ISO range in `eventdate`;
% classic BibTeX styles ignore that field.
@inproceedings{TIEDEMANN12.463,
  author    = {J{\"o}rg Tiedemann},
  title     = {Parallel Data, Tools and Interfaces in {OPUS}},
  booktitle = {Proceedings of the Eighth International Conference on Language Resources and Evaluation ({LREC}'12)},
  year      = {2012},
  month     = may,
  eventdate = {2012-05-23/2012-05-25},
  address   = {Istanbul, Turkey},
  editor    = {Nicoletta Calzolari and Khalid Choukri and Thierry Declerck and Mehmet Ugur Dogan and Bente Maegaard and Joseph Mariani and Jan Odijk and Stelios Piperidis},
  publisher = {European Language Resources Association ({ELRA})},
  isbn      = {978-2-9517408-7-7},
  language  = {english}
}
% OSCAR -- paper from the CMLC-7 workshop (2019).
% The first author's family name is the compound "Ortiz Suarez" (with an acute
% accent on the a); both words go before the comma so that "Ortiz" is not
% treated as a given name. The citation key is left unchanged for existing \cite commands.
@inproceedings{suarez2019asynchronous,
  author       = {Ortiz Su{\'a}rez, Pedro Javier and Sagot, Beno{\^\i}t and Romary, Laurent},
  title        = {Asynchronous pipeline for processing huge corpora on medium to low resource infrastructures},
  booktitle    = {7th Workshop on the Challenges in the Management of Large Corpora (CMLC-7)},
  year         = {2019},
  organization = {Leibniz-Institut f{\"u}r Deutsche Sprache}
}
% Phontron -- the Kyoto Free Translation Task, cited by its web page.
% The address is wrapped in \url{} like the other URL-only entries in this file,
% so it is typeset (and line-broken) as a URL rather than as running text.
@misc{neubig11kftt,
  author       = {Graham Neubig},
  title        = {The {Kyoto} Free Translation Task},
  howpublished = {\url{http://www.phontron.com/kftt}},
  year         = {2011}
}
% W2C -- corpus record identified by its hdl.handle.net persistent URL.
% The dashes in the title are written as the TeX ligature "--" instead of raw
% Unicode en dashes, matching the escape-based encoding used elsewhere in this file.
@misc{11858/00-097C-0000-0022-6133-9,
  title     = {{W2C} -- Web to Corpus -- Corpora},
  author    = {Majli{\v s}, Martin},
  url       = {http://hdl.handle.net/11858/00-097C-0000-0022-6133-9},
  note      = {{LINDAT}/{CLARIAH}-{CZ} digital library at the Institute of Formal and Applied Linguistics ({{\'U}FAL}), Faculty of Mathematics and Physics, Charles University},
  copyright = {Attribution-{ShareAlike} 3.0 Unported ({CC} {BY}-{SA} 3.0)},
  year      = {2011}
}
% XLSum -- paper in Findings of ACL/IJCNLP 2021 (record exported from DBLP).
% The dataset name {XL-Sum} is brace-protected so that sentence-casing styles
% do not lower-case it.
@inproceedings{DBLP:conf/acl/HasanBIMLKRS21,
  author    = {Tahmid Hasan and
               Abhik Bhattacharjee and
               Md. Saiful Islam and
               Kazi Samin Mubasshir and
               Yuan{-}Fang Li and
               Yong{-}Bin Kang and
               M. Sohel Rahman and
               Rifat Shahriyar},
  editor    = {Chengqing Zong and
               Fei Xia and
               Wenjie Li and
               Roberto Navigli},
  title     = {{XL-Sum}: Large-Scale Multilingual Abstractive Summarization for 44
               Languages},
  booktitle = {Findings of the Association for Computational Linguistics: {ACL/IJCNLP}
               2021, Online Event, August 1-6, 2021},
  series    = {Findings of {ACL}},
  volume    = {{ACL/IJCNLP} 2021},
  pages     = {4693--4703},
  publisher = {Association for Computational Linguistics},
  year      = {2021},
  url       = {https://doi.org/10.18653/v1/2021.findings-acl.413},
  doi       = {10.18653/v1/2021.findings-acl.413},
  timestamp = {Fri, 15 Jul 2022 14:52:04 +0200},
  biburl    = {https://dblp.org/rec/conf/acl/HasanBIMLKRS21.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Websites %%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% AI4Bharat -- website reference: https://ai4bharat.org/
% The entry has no author, so `key` supplies the text that styles use for
% sorting and for building the citation label.
@misc{AI4Bharat2022,
  key          = {AI4Bharat},
  title        = {AI4Bharat},
  howpublished = {\url{https://ai4bharat.org/}},
  note         = {Accessed: 2022-11}
}
% AIFORTHAI-LotusCorpus -- reference to the downloadable corpus archive (URL below).
% The entry has no author, so `key` supplies the text that styles use for
% sorting and for building the citation label.
@misc{AIFORTHAI2022,
  key          = {AIFORTHAI-LotusCorpus},
  title        = {AIFORTHAI-LotusCorpus},
  howpublished = {\url{https://github.com/korakot/corpus/releases/download/v1.0/AIFORTHAI-LotusCorpus.zip}},
  note         = {Accessed: 2022-11}
}
% Akuapem -- reference to Zenodo record 4432117.
% The trailing "#..." fragment was dropped from the URL: it is not needed to
% reach the record, and a literal hash sign is a TeX special character that can
% break \url when the entry text ends up inside another macro's argument.
% The entry has no author, so `key` supplies the text that styles use for
% sorting and for building the citation label.
@misc{Akuapem2022,
  key          = {Akuapem},
  title        = {Akuapem},
  howpublished = {\url{https://zenodo.org/record/4432117}},
  note         = {Accessed: 2022-11}
}
% Anuvaad,\footnote{\url{https://github.com/project-anuvaad/anuvaad-parallel-corpus}}
@misc{Anuvaad2022,
title = {Anuvaad},
note = {Accessed: 2022-11},
howpublished = {\url{https://github.com/project-anuvaad/anuvaad-parallel-corpus}}