-
Notifications
You must be signed in to change notification settings - Fork 191
/
test_document_minhash_deduplicator.py
963 lines (954 loc) · 71.5 KB
/
test_document_minhash_deduplicator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
import unittest
from data_juicer.core.data import NestedDataset as Dataset
from data_juicer.ops.deduplicator.document_minhash_deduplicator import \
DocumentMinhashDeduplicator
from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
class DocumentMinhashDeduplicatorTest(DataJuicerTestCaseBase):
def _run_minhash_dedup(self, dataset: Dataset, target_list, op):
    """Apply the dedup operator to ``dataset`` and check the remaining texts.

    The dataset is mapped through ``op.compute_hash``, handed to
    ``op.process``, reduced to its ``'text'`` column and converted to a
    list, which must equal ``target_list``.

    :param dataset: input dataset to run the operator on.
    :param target_list: expected rows once only the ``'text'`` column
        is kept.
    :param op: operator object providing ``compute_hash`` and ``process``.
    """
    hashed = dataset.map(op.compute_hash)
    # ``process`` returns a pair; only its first element is checked here.
    deduplicated, _ = op.process(hashed)
    text_only = deduplicated.select_columns(column_names=['text'])
    self.assertEqual(text_only.to_list(), target_list)
def test_english_deduplication(self):
ds_list = [
{
'text': 'Today is Sunday and it\'s a happy day!'
},
{
'text': 'Do you need a cup of coffee?'
},
{
'text': 'Today is sunday and it\'s really a happy day!'
},
{
'text':
'This paper proposed a novel method on LLM pretraining.'
},
{
'text':
'Smithfield employs 3,700 people at its plant in Sioux Falls, '
'South Dakota. The plant slaughters 19,500 pigs a day — 5 '
'percent of U.S. pork. Most of the workers are immigrants '
'from Ethiopia, Mexico, South Sudan, Honduras, Myanmar, '
'Somalia, Guatemala, and other poor countries.\n\nInevitably '
'workers must pass within one foot of hundreds of colleagues '
'in the hallways, locker rooms, cafeterias, and cutting '
'lines. The same conditions have spurred Covid-19 outbreaks '
'at meat plants from Minnesota and Wisconsin to Colorado, '
'Nebraska, Missouri, Iowa, Pennsylvania, North Carolina, and '
'Georgia.\n\n801 workers at the Sioux Falls plant have tested '
'positive, together with 206 people close to them. The '
'outbreak has killed Agustín Rodríguez Martínez, aged 64, an '
'employee with two decades of experience originally from El '
'Salvador, and Craig Allen Franken, 61, who worked for '
'Smithfield his entire adult life.\n\nThe company knew of its '
'first infection on March 24 or earlier. The virus spread '
'exponentially for several weeks. Ahead of Easter Sunday and '
'Monday (April 12-13), Smithfield promised to “completely '
'shutter” to sanitize and put up cardboard and plastic sheet '
'dividers. This would not end transmission, as potentially '
'hundreds of staff were already carrying the virus. But even '
'during this “shutdown,” many cars were seen in the parking '
'lot. The mayor admits that the company lied, and the local '
'AFL-CIO alleges the plant ran 60 percent production. On '
'Easter, with 238 known infections, Smithfield finally '
'agreed to shut down indefinitely after a request from the '
'mayor and the governor. Yet the company insisted on waiting '
'three more days to actually halt production.\n\nSmithfield '
'denied contributing to the outbreak, saying it took a “very '
'proactive approach.” Relying on racism, the company blamed '
'workers for getting themselves sick. A spokesperson said '
'the outbreak was so severe because of the plant’s “large '
'immigrant population,” claming “Living circumstances in '
'certain cultures are different than they are with your '
'traditional American family.” They slandered the workers as '
'dirty, ignorant, and untrustworthy with help from governor '
'Kristi Noem, who claimed, “99 percent of what’s going on '
'today wasn’t happening inside the facility. It was more at '
'home, where these employees were going home and spreading '
'some of the virus” by living too close together.\n\nOne '
'sick worker, Michael Bul Gayo Gatluak, 22 and originally '
'from South Sudan, says, “With how we work on the line, '
'I would say I got sick because of them not taking safety '
'measures.” His job is “really, really close” to other '
'workers chopping fresh-killed pigs. “The job is so heavy. '
'You have to breathe so hard.”\n\nIn early March, '
'union officials requested masks, overcoats, entrance '
'checking for fevers, and less crowding in 500-capacity '
'cafeterias. But Smithfield waited on most safety measures '
'until early April. Only April 6 did they start checking for '
'fevers. Instead of protective masks, they gave out beard '
'nets.\n\nSmithfield concealed infections with a policy of '
'informing only employees whose work stations were in the '
'same area as a person who tested positive. The fact that '
'workers are required to move around was willfully ignored. '
'One worker who tested positive said, “I clearly would have '
'gotten it at the factory. This week I have worked on three '
'different floors. I’ve eaten in two different cafeterias … '
'I’ve been walking through the whole place.” Employees from '
'the eighth floor of the plant were quarantined, '
'but everyone else was told to keep working.\n\nWhat Is '
'Really Going On?\n\nAverage plant wages are around $16 an '
'hour. Smithfield never raised them. Instead, they offered '
'$500 to employees who could go all of April without an '
'unapproved day off. The company says their “Responsibility '
'Bonuses” show their “immense gratefulness” to employees '
'“for their selfless sacrifices.”\n\nMeanwhile, the local '
'Argus Leader wrote union members wanted essential-worker '
'hazard pay, which “would be considered hourly compensation '
'about 1.5 or two times their normal pay.” One worker said, '
'“I feel like they’re bribing us with [the bonus] to come to '
'work sick. That’s how you know they don’t care.”\n\nBoth '
'Sioux Falls workers killed by Covid-19 were in their '
'sixties. It is unconscionable that they were still working. '
'All meatpackers over 50 should be on paid leave. Agustín '
'Rodríguez, 64, had a rough job sawing the legs off dead '
'pigs. He mopped floors with a fever shortly before he was '
'hospitalized.\n\nWhen CEO Kenneth Sullivan closed the '
'plant, he claimed, “We have continued to run our facilities '
'for one reason: to sustain our nation’s food supply.” This '
'is an effort to sweep Smithfield’s abuses under the rug, '
'as if the company were operating for public benefit. This '
'patriotic propaganda that all Americans are in it together '
'is like a drug to keep workers from getting '
'organized.\n\nThe major union in the industry, including at '
'Smithfield, is the United Food and Commercial Workers union '
'(UFCW). What union leaders have done is ultimately '
'troubling.\n\nCan Workers Fight?\n\nLocal AFL-CIO president '
'Kooper Caraway has publicly said management delayed safety '
'action as long as possible for profit. But while some '
'workers were demanding a two-week shutdown, Caraway told '
'the Argus Leader that was unrealistic because the '
'government considers the plant essential. He suggested the '
'union would be happy with minimal safety measures: “Even if '
'10 people get exposed in a day rather than 11. If you can '
'implement a program where even one or two less people get '
'exposed during a shift, that’s one or two less people.” Of '
'course reducing infections is good, but suggesting workers '
'would be satisfied if the company allowed 90% of the '
'contagion to continue is horrifying.\n\nThe response of '
'UFCW leadership was worse. As the disease was exploding, '
'they told the Argus Leader, “We applaud [Smithfield’s] '
'decision to temporarily close the plant [over Easter '
'weekend] to push for an even safer work environment.” What '
'does “even safer” mean in this context?\n\nThe union '
'bureaucracy has taken weak action elsewhere. In '
'Pennsylvania, the UFCW negotiated $2 hazard pay for two '
'months with Cargill Meat — the same pandemic premium Amazon '
'gave workers without a union. In Nebraska, the UFCW '
'negotiated $4 hazard pay for one month with meat giant '
'JBS.\n\nThe union has said nothing about forcing companies '
'to send older workers home with pay, even though a '
'70-year-old shop steward and a 78-year-old grandfather '
'working at JBS plants were killed by Covid-19. Smithfield '
'workers were promised only two weeks of shutdown pay. For '
'many, this compensation is half their normal paycheck '
'because they routinely put in 66 hour weeks — overtime that '
'costs exhaustion and chronic pain.\n\nUnion officials '
'endeavor to cooperate with the meat companies. An Iowa UFCW '
'president actually suggested it might be impossible for '
'plants to move workers a full six feet apart and told the '
'Des Moines Register, “We can’t stop the plants. If we stop '
'the plants from running, we stop feeding the country. We '
'want to do everything we can to make sure the employees are '
'safe to keep the plant running.”\n\nEvery part of this '
'explanation directly overlaps with what the Smithfield CEO '
'said. Unfortunately, it amounts to accepting the company’s '
'excuses.\n\nThey claim that workers who do hard physical '
'labor, waking up at 4 a.m. and often working six days a '
'week for years, would be guilty of taking food away from '
'the people and hurting America if they dared to fight for '
'their human needs. But nothing is said about the company '
'raking in profits and even murdering workers to increase '
'them.\n\nSmithfield’s parent company W.H. Group, '
'which slaughters around 30 million pigs per year in plants '
'in both the United States and China, saw its profits '
'skyrocket by about one third in 2019 to $1.38 billion. It '
'is disturbing that UFCW officials do not bring up these '
'soaring profits in their response to the outbreaks. Reuters '
'published a report on the corporation’s financial success '
'in late March. The head of W.H. Group had touted to the '
'media that it got through the pandemic in China with very '
'limited impact on production.\n\nIt is true that many '
'Smithfield workers are reasonably afraid for their jobs and '
'want to keep working. A 25-year-old employee explained, '
'“I have a lot of bills. My baby’s coming soon — I have to '
'work.” At the same time, he was afraid of infecting his '
'pregnant wife. His spouse, a former employee, '
'said bitterly, “Smithfield— they don’t care about '
'employees. They only care about their money.”\n\nWorkers '
'are pressured in these two painful directions. Nonetheless, '
'work can mean solidarity. Before Smithfield even checked '
'temperatures, there was a “sick-out” strike without union '
'support by 800 to 1,000 workers at a JBS meat factory in '
'Colorado. Hundreds of workers also called in sick days at a '
'Nebraska JBS plant.\n\nTrade union leaders won’t even '
'whisper the word “strike” when thousands of workers are '
'thinking about it. They are limiting themselves to polite '
'requests. We need a workers’ movement that asks who '
'controls the factory, that threatens to disrupt the bosses’ '
'profits, and that allows workers to use their immense power '
'— this could change the meat industry and the world. '
},
{
'text':
'Smithfield employs 3,700 people at its plants in Sioux '
'Falls, South Dakota. The plant slaughters 19,500 pig a day '
'— 5 percent of U.S. pork. Most of the workers are '
'immigrants from Ethiopia, Mexico, South Sudan, Honduras, '
'Myanmar, Somalia, Guatemala, and other poor '
'countries.\n\nInevitably workers must pass within one foot '
'of hundreds of colleagues in the hallways, locker rooms, '
'cafeterias, and cutting lines. The same conditions have '
'spurred Covid-19 outbreaks at meat plants from Minnesota '
'and Wisconsin to Colorado, Nebraska, Missouri, Iowa, '
'Pennsylvania, North Carolina, and Georgia.\n\n801 workers '
'at the Sioux Falls plant have tested positive, together '
'with 206 people close to them. The outbreak has killed '
'Agustín Rodríguez Martínez, aged 64, an employee with two '
'decades of experience originally from El Salvador, '
'and Craig Allen Franken, 61, who worked for Smithfield his '
'entire adult life.\n\nThe company knew of its first '
'infection on March 24 or earlier. The virus spread '
'exponentially for several weeks. Ahead of Easter Sunday and '
'Monday (April 12-13), Smithfield promised to “completely '
'shutter” to sanitize and put up cardboard and plastic sheet '
'dividers. This would not end transmission, as potentially '
'hundreds of staff were already carrying the virus. But even '
'during this “shutdown,” many cars were seen in the parking '
'lot. The mayor admits that the company lied, and the local '
'AFL-CIO alleges the plant ran 60 percent production. On '
'Easter, with 238 known infections, Smithfield finally '
'agreed to shut down indefinitely after a request from the '
'mayor and the governor. Yet the company insisted on waiting '
'three more days to actually halt production.\n\nSmithfield '
'denied contributing to the outbreak, saying it took a “very '
'proactive approach.” Relying on racism, the company blamed '
'workers for getting themselves sick. A spokesperson said '
'the outbreak was so severe because of the plant’s “large '
'immigrant population,” claming “Living circumstances in '
'certain cultures are different than they are with your '
'traditional American family.” They slandered the workers as '
'dirty, ignorant, and untrustworthy with help from governor '
'Kristi Noem, who claimed, “99 percent of what’s going on '
'today wasn’t happening inside the facility. It was more at '
'home, where these employees were going home and spreading '
'some of the virus” by living too close together.\n\nOne '
'sick worker, Michael Bul Gayo Gatluak, 22 and originally '
'from South Sudan, says, “With how we work on the line, '
'I would say I got sick because of them not taking safety '
'measures.” His job is “really, really close” to other '
'workers chopping fresh-killed pigs. “The job is so heavy. '
'You have to breathe so hard.”\n\nIn early March, '
'union officials requested masks, overcoats, entrance '
'checking for fevers, and less crowding in 500-capacity '
'cafeterias. But Smithfield waited on most safety measures '
'until early April. Only April 6 did they start checking for '
'fevers. Instead of protective masks, they gave out beard '
'nets.\n\nSmithfield concealed infections with a policy of '
'informing only employees whose work stations were in the '
'same area as a person who tested positive. The fact that '
'workers are required to move around was willfully ignored. '
'One worker who tested positive said, “I clearly would have '
'gotten it at the factory. This week I have worked on three '
'different floors. I’ve eaten in two different cafeterias … '
'I’ve been walking through the whole place.” Employees from '
'the eighth floor of the plant were quarantined, '
'but everyone else was told to keep working.\n\nWhat Is '
'Really Going On?\n\nAverage plant wages are around $16 an '
'hour. Smithfield never raised them. Instead, they offered '
'$500 to employees who could go all of April without an '
'unapproved day off. The company says their “Responsibility '
'Bonuses” show their “immense gratefulness” to employees '
'“for their selfless sacrifices.”\n\nMeanwhile, the local '
'Argus Leader wrote union members wanted essential-worker '
'hazard pay, which “would be considered hourly compensation '
'about 1.5 or two times their normal pay.” One worker said, '
'“I feel like they’re bribing us with [the bonus] to come to '
'work sick. That’s how you know they don’t care.”\n\nBoth '
'Sioux Falls workers killed by Covid-19 were in their '
'sixties. It is unconscionable that they were still working. '
'All meatpackers over 50 should be on paid leave. Agustín '
'Rodríguez, 64, had a rough job sawing the legs off dead '
'pigs. He mopped floors with a fever shortly before he was '
'hospitalized.\n\nWhen CEO Kenneth Sullivan closed the '
'plant, he claimed, “We have continued to run our facilities '
'for one reason: to sustain our nation’s food supply.” This '
'is an effort to sweep Smithfield’s abuses under the rug, '
'as if the company were operating for public benefit. This '
'patriotic propaganda that all Americans are in it together '
'is like a drug to keep workers from getting '
'organized.\n\nThe major union in the industry, including at '
'Smithfield, is the United Food and Commercial Workers union '
'(UFCW). What union leaders have done is ultimately '
'troubling.\n\nCan Workers Fight?\n\nLocal AFL-CIO president '
'Kooper Caraway has publicly said management delayed safety '
'action as long as possible for profit. But while some '
'workers were demanding a two-week shutdown, Caraway told '
'the Argus Leader that was unrealistic because the '
'government considers the plant essential. He suggested the '
'union would be happy with minimal safety measures: “Even if '
'10 people get exposed in a day rather than 11. If you can '
'implement a program where even one or two less people get '
'exposed during a shift, that’s one or two less people.” Of '
'course reducing infections is good, but suggesting workers '
'would be satisfied if the company allowed 90% of the '
'contagion to continue is horrifying.\n\nThe response of '
'UFCW leadership was worse. As the disease was exploding, '
'they told the Argus Leader, “We applaud [Smithfield’s] '
'decision to temporarily close the plant [over Easter '
'weekend] to push for an even safer work environment.” What '
'does “even safer” mean in this context?\n\nThe union '
'bureaucracy has taken weak action elsewhere. In '
'Pennsylvania, the UFCW negotiated $2 hazard pay for two '
'months with Cargill Meat — the same pandemic premium Amazon '
'gave workers without a union. In Nebraska, the UFCW '
'negotiated $4 hazard pay for one month with meat giant '
'JBS.\n\nThe union has said nothing about forcing companies '
'to send older workers home with pay, even though a '
'70-year-old shop steward and a 78-year-old grandfather '
'working at JBS plants were killed by Covid-19. Smithfield '
'workers were promised only two weeks of shutdown pay. For '
'many, this compensation is half their normal paycheck '
'because they routinely put in 66 hour weeks — overtime that '
'costs exhaustion and chronic pain.\n\nUnion officials '
'endeavor to cooperate with the meat companies. An Iowa UFCW '
'president actually suggested it might be impossible for '
'plants to move workers a full six feet apart and told the '
'Des Moines Register, “We can’t stop the plants. If we stop '
'the plants from running, we stop feeding the country. We '
'want to do everything we can to make sure the employees are '
'safe to keep the plant running.”\n\nEvery part of this '
'explanation directly overlaps with what the Smithfield CEO '
'said. Unfortunately, it amounts to accepting the company’s '
'excuses.\n\nThey claim that workers who do hard physical '
'labor, waking up at 4 a.m. and often working six days a '
'week for years, would be guilty of taking food away from '
'the people and hurting America if they dared to fight for '
'their human needs. But nothing is said about the company '
'raking in profits and even murdering workers to increase '
'them.\n\nSmithfield’s parent company W.H. Group, '
'which slaughters around 30 million pigs per year in plants '
'in both the United States and China, saw its profits '
'skyrocket by about one third in 2019 to $1.38 billion. It '
'is disturbing that UFCW officials do not bring up these '
'soaring profits in their response to the outbreaks. Reuters '
'published a report on the corporation’s financial success '
'in late March. The head of W.H. Group had touted to the '
'media that it got through the pandemic in China with very '
'limited impact on production.\n\nIt is true that many '
'Smithfield workers are reasonably afraid for their jobs and '
'want to keep working. A 25-year-old employee explained, '
'“I have a lot of bills. My baby’s coming soon — I have to '
'work.” At the same time, he was afraid of infecting his '
'pregnant wife. His spouse, a former employee, '
'said bitterly, “Smithfield— they don’t care about '
'employees. They only care about their money.”\n\nWorkers '
'are pressured in these two painful directions. Nonetheless, '
'work can mean solidarity. Before Smithfield even checked '
'temperatures, there was a “sick-out” strike without union '
'support by 800 to 1,000 workers at a JBS meat factory in '
'Colorado. Hundreds of workers also called in sick days at a '
'Nebraska JBS plant.\n\nTrade union leaders won’t even '
'whisper the word “strike” when thousands of workers are '
'thinking about it. They are limiting themselves to polite '
'requests. We need a workers’ movement that asks who '
'controls the factory, that threatens to disrupt the bosses’ '
'profits, and that allows workers to use their immense power '
'— this could change the meat industry and the world. '
},
{
'text':
'Smithfield employs 3,700 people at its plant in Sioux '
'Falls, South Dakota. The plant slaughters 19,500 pigs a day '
'— 5 percent of U.S. pork. Most of the workers are '
'immigrants from Ethiopia, Mexico, South Sudan, Honduras, '
'Myanmar, Somalia, Guatemala, and other poor '
'countries.\n\nInevitably workers must pass within one foot '
'of hundreds of colleagues in the hallways, locker rooms, '
'cafeterias, and cutting lines. The same conditions have '
'spurred Covid-19 outbreaks at meat plants from Minnesota '
'and Wisconsin to Colorado, Nebraska, Missouri, Iowa, '
'Pennsylvania, North Carolina, and Georgia.\n\n801 workers '
'at the Sioux Falls plant have tested positive, together '
'with 206 people close to them. The outbreak has killed '
'Agustín Rodríguez Martínez, aged 64, an employee with two '
'decades of experience originally from El Salvador, '
'and Craig Allen Franken, 61, who worked for Smithfield his '
'entire adult life.\n\nThe company knew of its first '
'infection on March 24 or earlier. The virus spread '
'exponentially for several weeks. Ahead of Easter Sunday and '
'Monday (April 12-13), Smithfield promised to “completely '
'shutter” to sanitize and put up cardboard and plastic sheet '
'dividers. This would not end transmission, as potentially '
'hundreds of staff were already carrying the virus. But even '
'during this “shutdown,” many cars were seen in the parking '
'lot. The mayor admits that the company lied, and the local '
'AFL-CIO alleges the plant ran 60 percent production. On '
'Easter, with 238 known infections, Smithfield finally '
'agreed to shut down indefinitely after a request from the '
'mayor and the governor. Yet the company insisted on waiting '
'three more days to actually halt production.\n\nSmithfield '
'denied contributing to the outbreak, saying it took a “very '
'proactive approach.” Relying on racism, the company blamed '
'workers for getting themselves sick. A spokesperson said '
'the outbreak was so severe because of the plant’s “large '
'immigrant population,” claming “Living circumstances in '
'certain cultures are different than they are with your '
'traditional American family.” They slandered the workers as '
'dirty, ignorant, and untrustworthy with help from governor '
'Kristi Noem, who claimed, “99 percent of what’s going on '
'today wasn’t happening inside the facility. It was more at '
'home, where these employees were going home and spreading '
'some of the virus” by living too close together.\n\nOne '
'sick worker, Michael Bul Gayo Gatluak, 22 and originally '
'from South Sudan, says, “With how we work on the line, '
'I would say I got sick because of them not taking safety '
'measures.” His job is “really, really close” to other '
'workers chopping fresh-killed pigs. “The job is so heavy. '
'You have to breathe so hard.”\n\nIn early March, '
'union officials requested masks, overcoats, entrance '
'checking for fevers, and less crowding in 500-capacity '
'cafeterias. But Smithfield waited on most safety measures '
'until early April. Only April 6 did they start checking for '
'fevers. Instead of protective masks, they gave out beard '
'nets.\n\nSmithfield concealed infections with a policy of '
'informing only employees whose work stations were in the '
'same area as a person who tested positive. The fact that '
'workers are required to move around was willfully ignored. '
'One worker who tested positive said, “I clearly would have '
'gotten it at the factory. This week I have worked on three '
'different floors. I’ve eaten in two different cafeterias … '
'I’ve been walking through the whole place.” Employees from '
'the eighth floor of the plant were quarantined, '
'but everyone else was told to keep working.\n\nWhat Is '
'Really Going On?\n\nAverage plant wages are around $16 an '
'hour. Smithfield never raised them. Instead, they offered '
'$500 to employees who could go all of April without an '
'unapproved day off. The company says their “Responsibility '
'Bonuses” show their “immense gratefulness” to employees '
'“for their selfless sacrifices.”\n\nMeanwhile, the local '
'Argus Leader wrote union members wanted essential-worker '
'hazard pay, which “would be considered hourly compensation '
'about 1.5 or two times their normal pay.” One worker said, '
'“I feel like they’re bribing us with [the bonus] to come to '
'work sick. That’s how you know they don’t care.”\n\nBoth '
'Sioux Falls workers killed by Covid-19 were in their '
'sixties. It is unconscionable that they were still working. '
'All meatpackers over 50 should be on paid leave. Agustín '
'Rodríguez, 64, had a rough job sawing the legs off dead '
'pigs. He mopped floors with a fever shortly before he was '
'hospitalized.\n\nWhen CEO Kenneth Sullivan closed the '
'plant, he claimed, “We have continued to run our facilities '
'for one reason: to sustain our nation’s food supply.” This '
'is an effort to sweep Smithfield’s abuses under the rug, '
'as if the company were operating for public benefit. This '
'patriotic propaganda that all Americans are in it together '
'is like a drug to keep workers from getting organized. '
},
{
'text':
'Smithfield employs 3,700 people at its plants in Sioux '
'Falls, South Dakota. The plant slaughters 19,500 pig a day '
'— 5 percent of U.S. pork. Most of the workers are '
'immigrants from Ethiopia, Mexico, South Sudan, Honduras, '
'Myanmar, Somalia, Guatemala, and other poor '
'countries.\n\nInevitably workers must pass within one foot '
'of hundreds of colleagues in the hallways, locker rooms, '
'cafeterias, and cutting lines. The same conditions have '
'spurred Covid-19 outbreaks at meat plants from Minnesota '
'and Wisconsin to Colorado, Nebraska, Missouri, Iowa, '
'Pennsylvania, North Carolina, and Georgia.\n\n801 workers '
'at the Sioux Falls plant have tested positive, together '
'with 206 people close to them. The outbreak has killed '
'Agustín Rodríguez Martínez, aged 64, an employee with two '
'decades of experience originally from El Salvador, '
'and Craig Allen Franken, 61, who worked for Smithfield his '
'entire adult life.\n\nThe company knew of its first '
'infection on March 24 or earlier. The virus spread '
'exponentially for several weeks. Ahead of Easter Sunday and '
'Monday (April 12-13), Smithfield promised to “completely '
'shutter” to sanitize and put up cardboard and plastic sheet '
'dividers. This would not end transmission, as potentially '
'hundreds of staff were already carrying the virus. But even '
'during this “shutdown,” many cars were seen in the parking '
'lot. The mayor admits that the company lied, and the local '
'AFL-CIO alleges the plant ran 60 percent production. On '
'Easter, with 238 known infections, Smithfield finally '
'agreed to shut down indefinitely after a request from the '
'mayor and the governor. Yet the company insisted on waiting '
'three more days to actually halt production.\n\nSmithfield '
'denied contributing to the outbreak, saying it took a “very '
'proactive approach.” Relying on racism, the company blamed '
'workers for getting themselves sick. A spokesperson said '
'the outbreak was so severe because of the plant’s “large '
'immigrant population,” claming “Living circumstances in '
'certain cultures are different than they are with your '
'traditional American family.” They slandered the workers as '
'dirty, ignorant, and untrustworthy with help from governor '
'Kristi Noem, who claimed, “99 percent of what’s going on '
'today wasn’t happening inside the facility. It was more at '
'home, where these employees were going home and spreading '
'some of the virus” by living too close together.\n\nOne '
'sick worker, Michael Bul Gayo Gatluak, 22 and originally '
'from South Sudan, says, “With how we work on the line, '
'I would say I got sick because of them not taking safety '
'measures.” His job is “really, really close” to other '
'workers chopping fresh-killed pigs. “The job is so heavy. '
'You have to breathe so hard.”\n\nIn early March, '
'union officials requested masks, overcoats, entrance '
'checking for fevers, and less crowding in 500-capacity '
'cafeterias. But Smithfield waited on most safety measures '
'until early April. Only April 6 did they start checking for '
'fevers. Instead of protective masks, they gave out beard '
'nets.\n\nSmithfield concealed infections with a policy of '
'informing only employees whose work stations were in the '
'same area as a person who tested positive. The fact that '
'workers are required to move around was willfully ignored. '
'One worker who tested positive said, “I clearly would have '
'gotten it at the factory. This week I have worked on three '
'different floors. I’ve eaten in two different cafeterias … '
'I’ve been walking through the whole place.” Employees from '
'the eighth floor of the plant were quarantined, '
'but everyone else was told to keep working.\n\nWhat Is '
'Really Going On?\n\nAverage plant wages are around $16 an '
'hour. Smithfield never raised them. Instead, they offered '
'$500 to employees who could go all of April without an '
'unapproved day off. The company says their “Responsibility '
'Bonuses” show their “immense gratefulness” to employees '
'“for their selfless sacrifices.”\n\nMeanwhile, the local '
'Argus Leader wrote union members wanted essential-worker '
'hazard pay, which “would be considered hourly compensation '
'about 1.5 or two times their normal pay.” One worker said, '
'“I feel like they’re bribing us with [the bonus] to come to '
'work sick. That’s how you know they don’t care.”\n\nBoth '
'Sioux Falls workers killed by Covid-19 were in their '
'sixties. It is unconscionable that they were still working. '
'All meatpackers over 50 should be on paid leave. Agustín '
'Rodríguez, 64, had a rough job sawing the legs off dead '
'pigs. He mopped floors with a fever shortly before he was '
'hospitalized.\n\nWhen CEO Kenneth Sullivan closed the '
'plant, he claimed, “We have continued to run our facilities '
'for one reason: to sustain our nation’s food supply.” This '
'is an effort to sweep Smithfield’s abuses under the rug, '
'as if the company were operating for public benefit. This '
'patriotic propaganda that all Americans are in it together '
'is like a drug to keep workers from getting organized. '
},
{
'text':
'This paper proposed a novel method on LLM pretraining.'
},
]
tgt_list = [
{
'text': 'Today is Sunday and it\'s a happy day!'
},
{
'text': 'Do you need a cup of coffee?'
},
{
'text': 'Today is sunday and it\'s really a happy day!'
},
{
'text':
'This paper proposed a novel method on LLM pretraining.'
},
{
'text':
'Smithfield employs 3,700 people at its plant in Sioux '
'Falls, South Dakota. The plant slaughters 19,500 pigs a day '
'— 5 percent of U.S. pork. Most of the workers are '
'immigrants from Ethiopia, Mexico, South Sudan, Honduras, '
'Myanmar, Somalia, Guatemala, and other poor '
'countries.\n\nInevitably workers must pass within one foot '
'of hundreds of colleagues in the hallways, locker rooms, '
'cafeterias, and cutting lines. The same conditions have '
'spurred Covid-19 outbreaks at meat plants from Minnesota '
'and Wisconsin to Colorado, Nebraska, Missouri, Iowa, '
'Pennsylvania, North Carolina, and Georgia.\n\n801 workers '
'at the Sioux Falls plant have tested positive, together '
'with 206 people close to them. The outbreak has killed '
'Agustín Rodríguez Martínez, aged 64, an employee with two '
'decades of experience originally from El Salvador, '
'and Craig Allen Franken, 61, who worked for Smithfield his '
'entire adult life.\n\nThe company knew of its first '
'infection on March 24 or earlier. The virus spread '
'exponentially for several weeks. Ahead of Easter Sunday and '
'Monday (April 12-13), Smithfield promised to “completely '
'shutter” to sanitize and put up cardboard and plastic sheet '
'dividers. This would not end transmission, as potentially '
'hundreds of staff were already carrying the virus. But even '
'during this “shutdown,” many cars were seen in the parking '
'lot. The mayor admits that the company lied, and the local '
'AFL-CIO alleges the plant ran 60 percent production. On '
'Easter, with 238 known infections, Smithfield finally '
'agreed to shut down indefinitely after a request from the '
'mayor and the governor. Yet the company insisted on waiting '
'three more days to actually halt production.\n\nSmithfield '
'denied contributing to the outbreak, saying it took a “very '
'proactive approach.” Relying on racism, the company blamed '
'workers for getting themselves sick. A spokesperson said '
'the outbreak was so severe because of the plant’s “large '
'immigrant population,” claming “Living circumstances in '
'certain cultures are different than they are with your '
'traditional American family.” They slandered the workers as '
'dirty, ignorant, and untrustworthy with help from governor '
'Kristi Noem, who claimed, “99 percent of what’s going on '
'today wasn’t happening inside the facility. It was more at '
'home, where these employees were going home and spreading '
'some of the virus” by living too close together.\n\nOne '
'sick worker, Michael Bul Gayo Gatluak, 22 and originally '
'from South Sudan, says, “With how we work on the line, '
'I would say I got sick because of them not taking safety '
'measures.” His job is “really, really close” to other '
'workers chopping fresh-killed pigs. “The job is so heavy. '
'You have to breathe so hard.”\n\nIn early March, '
'union officials requested masks, overcoats, entrance '
'checking for fevers, and less crowding in 500-capacity '
'cafeterias. But Smithfield waited on most safety measures '
'until early April. Only April 6 did they start checking for '
'fevers. Instead of protective masks, they gave out beard '
'nets.\n\nSmithfield concealed infections with a policy of '
'informing only employees whose work stations were in the '
'same area as a person who tested positive. The fact that '
'workers are required to move around was willfully ignored. '
'One worker who tested positive said, “I clearly would have '
'gotten it at the factory. This week I have worked on three '
'different floors. I’ve eaten in two different cafeterias … '
'I’ve been walking through the whole place.” Employees from '
'the eighth floor of the plant were quarantined, '
'but everyone else was told to keep working.\n\nWhat Is '
'Really Going On?\n\nAverage plant wages are around $16 an '
'hour. Smithfield never raised them. Instead, they offered '
'$500 to employees who could go all of April without an '
'unapproved day off. The company says their “Responsibility '
'Bonuses” show their “immense gratefulness” to employees '
'“for their selfless sacrifices.”\n\nMeanwhile, the local '
'Argus Leader wrote union members wanted essential-worker '
'hazard pay, which “would be considered hourly compensation '
'about 1.5 or two times their normal pay.” One worker said, '
'“I feel like they’re bribing us with [the bonus] to come to '
'work sick. That’s how you know they don’t care.”\n\nBoth '
'Sioux Falls workers killed by Covid-19 were in their '
'sixties. It is unconscionable that they were still working. '
'All meatpackers over 50 should be on paid leave. Agustín '
'Rodríguez, 64, had a rough job sawing the legs off dead '
'pigs. He mopped floors with a fever shortly before he was '
'hospitalized.\n\nWhen CEO Kenneth Sullivan closed the '
'plant, he claimed, “We have continued to run our facilities '
'for one reason: to sustain our nation’s food supply.” This '
'is an effort to sweep Smithfield’s abuses under the rug, '
'as if the company were operating for public benefit. This '
'patriotic propaganda that all Americans are in it together '
'is like a drug to keep workers from getting '
'organized.\n\nThe major union in the industry, including at '
'Smithfield, is the United Food and Commercial Workers union '
'(UFCW). What union leaders have done is ultimately '
'troubling.\n\nCan Workers Fight?\n\nLocal AFL-CIO president '
'Kooper Caraway has publicly said management delayed safety '
'action as long as possible for profit. But while some '
'workers were demanding a two-week shutdown, Caraway told '
'the Argus Leader that was unrealistic because the '
'government considers the plant essential. He suggested the '
'union would be happy with minimal safety measures: “Even if '
'10 people get exposed in a day rather than 11. If you can '
'implement a program where even one or two less people get '
'exposed during a shift, that’s one or two less people.” Of '
'course reducing infections is good, but suggesting workers '
'would be satisfied if the company allowed 90% of the '
'contagion to continue is horrifying.\n\nThe response of '
'UFCW leadership was worse. As the disease was exploding, '
'they told the Argus Leader, “We applaud [Smithfield’s] '
'decision to temporarily close the plant [over Easter '
'weekend] to push for an even safer work environment.” What '
'does “even safer” mean in this context?\n\nThe union '
'bureaucracy has taken weak action elsewhere. In '
'Pennsylvania, the UFCW negotiated $2 hazard pay for two '
'months with Cargill Meat — the same pandemic premium Amazon '
'gave workers without a union. In Nebraska, the UFCW '
'negotiated $4 hazard pay for one month with meat giant '
'JBS.\n\nThe union has said nothing about forcing companies '
'to send older workers home with pay, even though a '
'70-year-old shop steward and a 78-year-old grandfather '
'working at JBS plants were killed by Covid-19. Smithfield '
'workers were promised only two weeks of shutdown pay. For '
'many, this compensation is half their normal paycheck '
'because they routinely put in 66 hour weeks — overtime that '
'costs exhaustion and chronic pain.\n\nUnion officials '
'endeavor to cooperate with the meat companies. An Iowa UFCW '
'president actually suggested it might be impossible for '
'plants to move workers a full six feet apart and told the '
'Des Moines Register, “We can’t stop the plants. If we stop '
'the plants from running, we stop feeding the country. We '
'want to do everything we can to make sure the employees are '
'safe to keep the plant running.”\n\nEvery part of this '
'explanation directly overlaps with what the Smithfield CEO '
'said. Unfortunately, it amounts to accepting the company’s '
'excuses.\n\nThey claim that workers who do hard physical '
'labor, waking up at 4 a.m. and often working six days a '
'week for years, would be guilty of taking food away from '
'the people and hurting America if they dared to fight for '
'their human needs. But nothing is said about the company '
'raking in profits and even murdering workers to increase '
'them.\n\nSmithfield’s parent company W.H. Group, '
'which slaughters around 30 million pigs per year in plants '
'in both the United States and China, saw its profits '
'skyrocket by about one third in 2019 to $1.38 billion. It '
'is disturbing that UFCW officials do not bring up these '
'soaring profits in their response to the outbreaks. Reuters '
'published a report on the corporation’s financial success '
'in late March. The head of W.H. Group had touted to the '
'media that it got through the pandemic in China with very '
'limited impact on production.\n\nIt is true that many '
'Smithfield workers are reasonably afraid for their jobs and '
'want to keep working. A 25-year-old employee explained, '
'“I have a lot of bills. My baby’s coming soon — I have to '
'work.” At the same time, he was afraid of infecting his '
'pregnant wife. His spouse, a former employee, '
'said bitterly, “Smithfield— they don’t care about '
'employees. They only care about their money.”\n\nWorkers '
'are pressured in these two painful directions. Nonetheless, '
'work can mean solidarity. Before Smithfield even checked '
'temperatures, there was a “sick-out” strike without union '
'support by 800 to 1,000 workers at a JBS meat factory in '
'Colorado. Hundreds of workers also called in sick days at a '
'Nebraska JBS plant.\n\nTrade union leaders won’t even '
'whisper the word “strike” when thousands of workers are '
'thinking about it. They are limiting themselves to polite '
'requests. We need a workers’ movement that asks who '
'controls the factory, that threatens to disrupt the bosses’ '
'profits, and that allows workers to use their immense power '
'— this could change the meat industry and the world. '
},
{
'text':
'Smithfield employs 3,700 people at its plant in Sioux '
'Falls, South Dakota. The plant slaughters 19,500 pigs a day '
'— 5 percent of U.S. pork. Most of the workers are '
'immigrants from Ethiopia, Mexico, South Sudan, Honduras, '
'Myanmar, Somalia, Guatemala, and other poor '
'countries.\n\nInevitably workers must pass within one foot '
'of hundreds of colleagues in the hallways, locker rooms, '
'cafeterias, and cutting lines. The same conditions have '
'spurred Covid-19 outbreaks at meat plants from Minnesota '
'and Wisconsin to Colorado, Nebraska, Missouri, Iowa, '
'Pennsylvania, North Carolina, and Georgia.\n\n801 workers '
'at the Sioux Falls plant have tested positive, together '
'with 206 people close to them. The outbreak has killed '
'Agustín Rodríguez Martínez, aged 64, an employee with two '
'decades of experience originally from El Salvador, '
'and Craig Allen Franken, 61, who worked for Smithfield his '
'entire adult life.\n\nThe company knew of its first '
'infection on March 24 or earlier. The virus spread '
'exponentially for several weeks. Ahead of Easter Sunday and '
'Monday (April 12-13), Smithfield promised to “completely '
'shutter” to sanitize and put up cardboard and plastic sheet '
'dividers. This would not end transmission, as potentially '
'hundreds of staff were already carrying the virus. But even '
'during this “shutdown,” many cars were seen in the parking '
'lot. The mayor admits that the company lied, and the local '
'AFL-CIO alleges the plant ran 60 percent production. On '
'Easter, with 238 known infections, Smithfield finally '
'agreed to shut down indefinitely after a request from the '
'mayor and the governor. Yet the company insisted on waiting '
'three more days to actually halt production.\n\nSmithfield '
'denied contributing to the outbreak, saying it took a “very '
'proactive approach.” Relying on racism, the company blamed '
'workers for getting themselves sick. A spokesperson said '
'the outbreak was so severe because of the plant’s “large '
'immigrant population,” claming “Living circumstances in '
'certain cultures are different than they are with your '
'traditional American family.” They slandered the workers as '
'dirty, ignorant, and untrustworthy with help from governor '
'Kristi Noem, who claimed, “99 percent of what’s going on '
'today wasn’t happening inside the facility. It was more at '
'home, where these employees were going home and spreading '
'some of the virus” by living too close together.\n\nOne '
'sick worker, Michael Bul Gayo Gatluak, 22 and originally '
'from South Sudan, says, “With how we work on the line, '
'I would say I got sick because of them not taking safety '
'measures.” His job is “really, really close” to other '
'workers chopping fresh-killed pigs. “The job is so heavy. '
'You have to breathe so hard.”\n\nIn early March, '
'union officials requested masks, overcoats, entrance '
'checking for fevers, and less crowding in 500-capacity '
'cafeterias. But Smithfield waited on most safety measures '
'until early April. Only April 6 did they start checking for '
'fevers. Instead of protective masks, they gave out beard '
'nets.\n\nSmithfield concealed infections with a policy of '
'informing only employees whose work stations were in the '
'same area as a person who tested positive. The fact that '
'workers are required to move around was willfully ignored. '
'One worker who tested positive said, “I clearly would have '
'gotten it at the factory. This week I have worked on three '
'different floors. I’ve eaten in two different cafeterias … '
'I’ve been walking through the whole place.” Employees from '
'the eighth floor of the plant were quarantined, '
'but everyone else was told to keep working.\n\nWhat Is '
'Really Going On?\n\nAverage plant wages are around $16 an '
'hour. Smithfield never raised them. Instead, they offered '
'$500 to employees who could go all of April without an '
'unapproved day off. The company says their “Responsibility '
'Bonuses” show their “immense gratefulness” to employees '
'“for their selfless sacrifices.”\n\nMeanwhile, the local '
'Argus Leader wrote union members wanted essential-worker '
'hazard pay, which “would be considered hourly compensation '
'about 1.5 or two times their normal pay.” One worker said, '
'“I feel like they’re bribing us with [the bonus] to come to '
'work sick. That’s how you know they don’t care.”\n\nBoth '
'Sioux Falls workers killed by Covid-19 were in their '
'sixties. It is unconscionable that they were still working. '
'All meatpackers over 50 should be on paid leave. Agustín '
'Rodríguez, 64, had a rough job sawing the legs off dead '
'pigs. He mopped floors with a fever shortly before he was '
'hospitalized.\n\nWhen CEO Kenneth Sullivan closed the '
'plant, he claimed, “We have continued to run our facilities '
'for one reason: to sustain our nation’s food supply.” This '
'is an effort to sweep Smithfield’s abuses under the rug, '
'as if the company were operating for public benefit. This '
'patriotic propaganda that all Americans are in it together '
'is like a drug to keep workers from getting organized. '
},
]
dataset = Dataset.from_list(ds_list)
op = DocumentMinhashDeduplicator(ignore_pattern=r'\p{P}')
self._run_minhash_dedup(dataset, tgt_list, op)
def test_chinese_deduplication(self):
    """Check near-duplicate removal on Chinese text samples.

    Four samples go in: two short unrelated sentences, one long meeting
    document, and a second copy of that document whose date line carries
    an extra label in front of the date. Three samples are expected back;
    the labelled copy is the one absent from the expected output.

    The operator is built with character-level tokenization and with
    ``ignore_pattern=r'\\p{P}'`` (the Unicode punctuation property class).
    """
    # Opening fragment of the long document as it appears in the kept copy.
    plain_opening = (
        '第九届会议\n2003年7月28日至8月8日\n牙买加金斯敦\n为来自发展中国家的法')
    # Same opening with the extra label inserted before the date.
    labelled_opening = (
        '第九届会议\n时间:2003年7月28日至8月8日\n牙买加金斯敦\n为来自发展中国家的法')
    # Everything after the opening fragment is shared by both copies.
    shared_remainder = (
        '律和技术委员会以及财务委员会成员\n参加委员会会议支付费用的方式\n1. 国际'
        '海底管理局大会第八届会议请秘书长采取一项临时措施,设立一个自愿信托基金,'
        '以便支付来自发展中国家的法律和技术委员会成员以及来自发展中国家的财务委员'
        '会成员参加委员会会议的费用。\n2. 由于秘书长向会员国发出为该信托基金捐款'
        '的请求,已收到三笔捐款,共计10 500美元。 管理局已为基金设立一个单独的账'
        '户。\n3. 管理局第八届会议还决定,由财务委员会审查资助参加这两个委员会会'
        '议的方式,包括审查是否可能从管理局行政预算中提供经费。\n4. 自愿信托基金'
        '迄今收到的捐款数额很小。 这两个委员会成员虽然由缔约国提名,但他们以个人身'
        '份当选。 因此,必须确保这些机构的成员在任期内能够参加会议并且持续不断地履'
        '行职务。 现已注意到,这两个委员会若干成员因旅费和生活津贴费用方面有困难而'
        '未能出席会议。 来自发展中国家成员参加会议的费用估计数见附件,其中比较了经'
        '济舱和公务舱机票价格以及适用于金斯敦的每日生活津贴费用。 从表中可以看出,'
        '根据不同的人数、机舱等级和会议持续时间,每年平均需要捐款120 000美元至'
        '215 000美元。\n5. 为了指导委员会确定提供经费的方式,对某些国际组织的现'
        '行办法作了一次简要调查。 为支付参加会议的旅费和生活费而设立信托基金最相关'
        '的实例是2000年大会为来自发展中国家的大陆架界限委员会成员设立的自愿信托基'
        '金。 目前这一基金正在运作,但现有资源有限。 联合国制定的程序表明,委员会'
        '成员的政府应在规定时间内尽可能提前提出请求。 这种请求按照先到先核可的办法'
        '处理。 提供的机票将是最直接路线的经济舱机票,每日生活津贴将按照联合国费率'
        '提供。 购买机票的所有安排均由联合国秘书处执行。\n6. 虽然已经设立了临时性'
        '的自愿信托基金,但是,对该基金的捐款数额很小,捐款速度很慢。 因此,除了对'
        '信托基金提供自愿捐款的办法之外,建议委员会还可以考虑采用下列办法:\n(a) '
        '从管理局一般行政经费累计利息中拨出一定数额的经费;\n(b) 每年从上一年预算'
        '未动用部分中拨出规定的数额;\n(c) 从先驱投资者基金利息中拨出规定的数额。'
        '\n7. 委员会还不妨建议由管理局秘书处依照行政规则和程序管理该基金,并向财'
        '务委员会提出一份报告。\n附件\n资助来自发展中国家的法律和技术委员会以及财'
        '务\n委员会成员出席会议的指示性费用(美元)\n成员\n机票\n机场\n费用\n金'
        '斯敦每日生活\n津贴\n转机途中每日生活\n7日\n共计\n14日\n经济舱\n公务舱'
        '\n7天=(8天每日生活\n津贴)\n14天= (15天每日生活津贴)\n商务舱\n法律和技'
        '术委员会\n印度尼西亚\n(纽约)\n黎巴嫩\n巴基斯坦\n阿根廷\n喀麦隆\n墨西哥'
        '\n巴西\n塞内加尔\n莫桑比克\n埃及(纽约)\n大韩民国\n印度\n斐济\n智利\n'
        '中国\n纳米比亚\n小计\n财务委员会\n缅甸\n乌干达\n牙买加\n印度(纽约)\n尼'
        '日利亚\n总计\n注:估计费用表表明每年资助每个机构一次会议需要经费120 000'
        '美元至215 000美元(四舍五入)。')

    meeting_doc = plain_opening + shared_remainder
    labelled_meeting_doc = labelled_opening + shared_remainder

    # Texts expected in the output, in input order.
    kept_texts = ['你好,请问你是谁', '欢迎来到阿里巴巴!', meeting_doc]

    # Input is the kept texts followed by the labelled copy.
    input_samples = [
        {'text': body} for body in kept_texts + [labelled_meeting_doc]
    ]
    expected_samples = [{'text': body} for body in kept_texts]

    source_ds = Dataset.from_list(input_samples)
    dedup_op = DocumentMinhashDeduplicator(tokenization='character',
                                           ignore_pattern=r'\p{P}')
    # NOTE(review): _run_minhash_dedup is defined outside this chunk;
    # presumably it applies the op and compares against the expected list.
    self._run_minhash_dedup(source_ds, expected_samples, dedup_op)
# Hand control to unittest's command-line runner when this file is executed
# directly as a script (not when it is imported).
if __name__ == '__main__':
    unittest.main()