forked from huggingface/datatrove
-
Notifications
You must be signed in to change notification settings - Fork 0
/
data.py
52 lines (37 loc) · 1.12 KB
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
"""Data classes for the datatrove package."""
from dataclasses import dataclass, field
from typing import Generator, NewType
class MediaType:
    """Integer codes naming the kinds of media.

    For future uses, currently not used.
    """

    IMAGE = 0
    VIDEO = 1
    AUDIO = 2
@dataclass
class Media:
"""Media metadata
For future uses, currently not used.
"""
type: int
url: str
alt: str | None = None
local_path: str | None = None
@dataclass
class Document:
    """Main Document dataclass going through the processing pipeline

    Args:
        text: str
            the actual text content for each sample
        id: str
            a unique id (string) for this sample
        media: list[Media]
            the media associated with the document;
            defaults to a new empty list for each instance
        metadata: dict[str, str | int | float | bool]
            a dictionary where any additional info may be stored;
            defaults to a new empty dict for each instance
    """

    text: str
    id: str
    # default_factory builds a fresh container per instance, so documents never
    # share one mutable default list/dict.
    media: list[Media] = field(default_factory=list)
    metadata: dict[str, str | int | float | bool] = field(default_factory=dict)
# Named type for a stream of documents: either a generator that yields
# ``Document`` objects (nothing is sent in and nothing is returned) or None.
# Presumably used to annotate what pipeline steps receive and produce --
# confirm against callers.
# NOTE(review): the typing docs describe NewType's base as a class-like type;
# static type checkers may reject a union here, although it evaluates at runtime.
DocumentsPipeline = NewType("DocumentsPipeline", Generator[Document, None, None] | None)