forked from AIObjectives/talk-to-the-city-reports
-
Notifications
You must be signed in to change notification settings - Fork 0
/
text_to_csv_v0.ts
122 lines (107 loc) · 3.5 KB
/
text_to_csv_v0.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import nodes from '$lib/node_register';
import categories from '$lib/node_categories';
import type { DGNodeInterface, BaseData } from '$lib/node_data_types';
import { getEncoding } from 'js-tiktoken';
import { format, unwrapFunctionStore } from 'svelte-i18n';
import _ from 'lodash';
// svelte-i18n's `format` store unwrapped via `unwrapFunctionStore` and bound to `$__`.
// NOTE(review): `$__` is not referenced anywhere in the visible part of this file.
const $__ = unwrapFunctionStore(format);
/**
 * Graph node that splits free text into token-bounded chunks and turns every
 * chunk into a row object (`comment-body`, `comment-id`, `interview` and,
 * when a video is set, `video` / `timestamp`).
 */
export default class TextToCSV {
  id: string;
  data: TextToCSVData;
  position: { x: number; y: number };
  type: string;

  /**
   * @param node_data Node definition; its `id`, `data`, `position` and `type`
   *   are copied onto the instance (`data` is kept by reference).
   */
  constructor(node_data: TextToCSVInterface) {
    const { id, data, position, type } = node_data;
    this.id = id;
    this.data = data;
    this.position = position;
    this.type = type;
  }

  /**
   * Tokenizes `val` with the `cl100k_base` encoding and emits one row per
   * chunk of at most `this.data.numTokens` tokens.
   *
   * If `numTokens` does not parse to a positive integer, the whole text is
   * emitted as a single chunk (a zero or negative stride would otherwise
   * never advance the loop).
   *
   * @param val Text to split.
   * @param interview Value stored under `interview` on every row.
   * @param video Optional video reference; when falsy, neither `video` nor
   *   `timestamp` is added to the rows.
   * @param timestamp Value stored under `timestamp` when `video` is set.
   * @param startIndex Offset added to the chunk position to build `comment-id`.
   * @returns One row object per chunk; empty when `val` produces no tokens.
   */
  toCSV(val, interview, video, timestamp, startIndex = 0) {
    const encoding = getEncoding('cl100k_base');
    const tokens = encoding.encode(val);

    // Parse the stride once, outside the loop, and reject unusable values.
    const requested = Number.parseInt(String(this.data.numTokens), 10);
    const chunkSize =
      Number.isInteger(requested) && requested > 0 ? requested : Math.max(tokens.length, 1);

    const docs = [];
    for (let start = 0; start < tokens.length; start += chunkSize) {
      // Decode the slice in one call: a single character may be spread over
      // several tokens, and decoding token-by-token would turn its partial
      // byte sequences into replacement characters.
      const body = encoding.decode(tokens.slice(start, start + chunkSize));
      docs.push({
        'comment-body': body,
        'comment-id': (startIndex + docs.length).toString(),
        interview: interview,
        ...(video ? { video: video, timestamp: timestamp } : {})
      });
    }
    return docs;
  }

  /**
   * Converts every input into rows and stores the result on `this.data.output`.
   *
   * String inputs take their metadata from `entries[<position of the input>]`;
   * for array inputs every element takes it from `entries[<position of the
   * element>]`. Missing or falsy fields fall back to the first entry. Inputs
   * that are neither strings nor arrays are ignored. `comment-id` values are
   * numbered consecutively across all inputs.
   *
   * Only `inputData` is read; the remaining parameters are accepted but unused
   * by this node.
   *
   * @param inputData Object whose values are the texts (or arrays of texts) to convert.
   * @returns All generated rows, in input order.
   */
  async compute(inputData, context, info, error, success, slug, Cookies) {
    this.data.dirty = false;
    const res = [];
    _.forEach(_.values(inputData), (input, i) => {
      if (_.isString(input)) {
        const rows = this.rowsForEntry(input, i, res.length);
        res.push(...rows);
      } else if (_.isArray(input)) {
        input.forEach((val, j) => {
          const rows = this.rowsForEntry(val, j, res.length);
          res.push(...rows);
        });
      }
    });
    this.data.output = res;
    return res;
  }

  /** Chunks `text` using the metadata of `entries[entryIndex]`, falling back to the first entry. */
  private rowsForEntry(text: string, entryIndex: number, startIndex: number) {
    // Tolerate a missing or empty entry list instead of throwing on `entries[0]`.
    const entries = this.data.entries ?? [];
    const entry = entries[entryIndex];
    const fallback = entries[0];
    return this.toCSV(
      text,
      entry?.interview || fallback?.interview,
      entry?.video || fallback?.video,
      entry?.timestamp || fallback?.timestamp,
      startIndex
    );
  }
}
/**
 * Metadata for one input text. `TextToCSV.compute` looks entries up by index
 * and passes their fields to `toCSV`, which copies them onto the generated rows.
 */
interface TextToCSVEntry {
/** Stored under the `interview` key of every row. */
interview: string;
/** Stored under the `video` key; rows get no `video`/`timestamp` keys when this is falsy. */
video: string;
/** Stored under the `timestamp` key, only alongside a truthy `video`. */
timestamp: string;
}
/** Node data for `TextToCSV`, extending the shared `BaseData` fields. */
interface TextToCSVData extends BaseData {
/** Result of the most recent `compute` call (the array of generated rows); `null` in the default node data. */
output: any;
/** Chunk size in tokens, kept as a string and parsed to an integer where it is used. */
numTokens: string;
/** Per-input metadata, looked up by index in `compute`; the first entry serves as the fallback. */
entries: TextToCSVEntry[];
}
/** Shape accepted by the `TextToCSV` constructor: a `DGNodeInterface` whose `data` is narrowed to `TextToCSVData`. */
type TextToCSVInterface = DGNodeInterface & {
data: TextToCSVData;
};
/**
 * Default node definition for `TextToCSV`: filed under the `wrangling`
 * category, with a chunk size of 1000 tokens and a single sample entry.
 * It is passed to `nodes.register` together with the class at the end of this module.
 */
export const text_to_csv_node_data: TextToCSVInterface = {
id: 'text_to_csv',
data: {
label: 'text_to_csv',
dirty: false,
compute_type: 'text_to_csv_v0',
input_ids: { data: '' },
category: categories.wrangling.id,
icon: 'text_to_csv_v0',
show_in_ui: true,
message: '',
// No rows yet; `compute` overwrites this with the generated rows.
output: null,
// Kept as a string; parsed to an integer when chunking.
numTokens: '1000',
// Sample metadata; the first entry is also the fallback for inputs without their own entry.
entries: [
{
interview: 'Alice',
video: 'https://www.youtube.com/watch?v=1qKz9W3bKbE',
timestamp: '00:00:00'
}
]
},
position: { x: 0, y: 0 },
type: 'text_to_csv_v0'
};
// Ready-made instance built from the default node data (shares the same `data` object by reference).
export const text_to_csv_node = new TextToCSV(text_to_csv_node_data);
// Import-time side effect: hands the class and its default data to the `nodes` register from `$lib/node_register`.
nodes.register(TextToCSV, text_to_csv_node_data);