-
Notifications
You must be signed in to change notification settings - Fork 2
/
python_files.txt
6709 lines (6149 loc) · 255 KB
/
python_files.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
<files>
<file>
<file_path>docs/src/examples/modal_langchain.py</file_path>
<file_content>
import sys
from modal import Secret, Stub, Image, web_endpoint
import lancedb
import re
import pickle
import requests
import zipfile
from pathlib import Path
from langchain.document_loaders import UnstructuredHTMLLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import LanceDB
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
# Container image for this app: Debian slim with the listed packages pip-installed.
lancedb_image = Image.debian_slim().pip_install(
"lancedb", "langchain", "openai", "pandas", "tiktoken", "unstructured", "tabulate"
)
# Modal stub that the functions below register against via @stub.function().
# NOTE(review): the attached secret presumably supplies the OpenAI API key — confirm.
stub = Stub(
name="example-langchain-lancedb",
image=lancedb_image,
secrets=[Secret.from_name("my-openai-secret")],
)
# Module-level placeholder; qanda_langchain() binds a local of the same name
# and does not update this global.
docsearch = None
# Pickle cache of the loaded documents, read and written by store_docs().
docs_path = Path("docs.pkl")
# Location handed to lancedb.connect() in qanda_langchain().
db_path = Path("lancedb")
def get_document_title(document):
    """Return the source-path fragment between "pandas.documentation" and ".html".

    Args:
        document: Object whose ``metadata["source"]`` holds the path of the
            file it was loaded from (anything ``str()`` accepts).

    Returns:
        The captured fragment of the path, or ``""`` when the source does not
        match the expected pattern.
    """
    source = str(document.metadata["source"])
    # search() returns None on a miss; the previous findall(...)[0] raised
    # IndexError for any source that did not match. The dots are escaped so
    # they only match literal "." characters.
    match = re.search(r"pandas\.documentation(.*)\.html", source)
    return match.group(1) if match else ""
def download_docs():
    """Download the pandas documentation archive and unpack it into ``pandas_docs``.

    The zip is saved as ``pandas.documentation.zip`` in the working directory
    and then extracted to the ``pandas_docs`` directory.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    archive_path = Path("pandas.documentation.zip")
    pandas_docs = requests.get(
        "https://eto-public.s3.us-west-2.amazonaws.com/datasets/pandas_docs/pandas.documentation.zip"
    )
    # Fail here rather than writing an error page to disk and hitting a
    # confusing BadZipFile during extraction.
    pandas_docs.raise_for_status()
    with open(archive_path, "wb") as f:
        f.write(pandas_docs.content)

    # Context manager closes the archive handle, which the original leaked.
    with zipfile.ZipFile(archive_path) as archive:
        archive.extractall(path=Path("pandas_docs"))
def store_docs():
    """Load the pandas HTML docs, caching the parsed documents in ``docs_path``.

    On the first call (no cache file) every ``*.html`` file under
    ``pandas_docs/pandas.documentation`` is loaded with
    ``UnstructuredHTMLLoader``, the first document of each file is tagged with
    ``title`` and ``version`` metadata, and the full list is pickled to
    ``docs_path``. Later calls just unpickle that file.

    Returns:
        The list of loaded documents.
    """
    docs = []

    if not docs_path.exists():
        for p in Path("pandas_docs/pandas.documentation").rglob("*.html"):
            if p.is_dir():
                continue
            raw_document = UnstructuredHTMLLoader(p).load()

            first = raw_document[0]
            first.metadata = first.metadata | {
                "title": get_document_title(first),
                "version": "2.0rc0",
            }
            # Store the source as a plain string instead of whatever object the loader put there.
            first.metadata["source"] = str(first.metadata["source"])
            # extend() appends in place; `docs = docs + raw_document` copied
            # the whole list once per file (quadratic overall).
            docs.extend(raw_document)

        with docs_path.open("wb") as fh:
            pickle.dump(docs, fh)
    else:
        # NOTE: pickle.load executes arbitrary code from the file; this is only
        # safe because the cache is written locally by the branch above.
        with docs_path.open("rb") as fh:
            docs = pickle.load(fh)

    return docs
def qanda_langchain(query):
    """Answer ``query`` with a retrieval QA chain built over the pandas docs.

    Downloads and loads the documentation, splits it into overlapping chunks,
    indexes the chunks in a freshly overwritten ``pandas_docs`` LanceDB table,
    and runs a "stuff" RetrievalQA chain against that index.

    Args:
        query: The question to ask.

    Returns:
        Whatever ``RetrievalQA.run`` returns for the query.
    """
    download_docs()
    loaded_docs = store_docs()

    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = splitter.split_documents(loaded_docs)
    embedder = OpenAIEmbeddings()

    connection = lancedb.connect(db_path)
    # The table is created with a single placeholder row before the real
    # documents are added through LanceDB.from_documents.
    seed_row = {
        "vector": embedder.embed_query("Hello World"),
        "text": "Hello World",
        "id": "1",
    }
    table = connection.create_table("pandas_docs", data=[seed_row], mode="overwrite")

    vector_store = LanceDB.from_documents(chunks, embedder, connection=table)
    chain = RetrievalQA.from_chain_type(
        llm=OpenAI(),
        chain_type="stuff",
        retriever=vector_store.as_retriever(),
    )
    return chain.run(query)
@stub.function()
@web_endpoint(method="GET")
def web(query: str):
    """GET endpoint: run the QA pipeline and return the result under ``"answer"``."""
    return {"answer": qanda_langchain(query)}
@stub.function()
def cli(query: str):
    """Run the QA pipeline for ``query`` and print the answer."""
    print(qanda_langchain(query))
</file_content>
<file_context>
<line>
<line_number>0, 1, 2</line_number>
<line_content>import sys,
from modal import Secret, Stub, Image, web_endpoint,
import lancedb</line_content>
<context>
The __import__() function is a wrapper around importlib.__import__(). import_module() simplifies importing modules and is the recommended programmatic way to import. find_spec() helps check if a module can be imported without loading it.</context>
</line>
<line>
<line_number>4, 5, 6</line_number>
<line_content>import pickle,
import requests,
import zipfile</line_content>
<context>
The zipimport module adds the ability to import Python modules from ZIP archives. It allows sys.path to contain paths to ZIP files, enabling modules inside those archives to be imported. The archive can have subdirectories to support package imports.</context>
</line>
<line>
<line_number>7, 9, 10</line_number>
<line_content>from pathlib import Path,
from langchain.document_loaders import UnstructuredHTMLLoader,
from langchain.embeddings import OpenAIEmbeddings</line_content>
<context>
The TraversableResources abstract base class extends ResourceReader to provide a concrete implementation for serving files through the importlib.resources module. It allows a loader to support reading package resources through both the</context>
</line>
<line>
<line_number>11, 12, 13</line_number>
<line_content>from langchain.text_splitter import RecursiveCharacterTextSplitter,
from langchain.vectorstores import LanceDB,
from langchain.llms import OpenAI</line_content>
<context>
Various methods exist for Unicode objects like concatenation, splitting, joining, finding substrings, replacing, comparison and formatting. The PyUnicode_InternInPlace and PyUnicode_InternFromString functions can intern strings.</context>
</line>
<line>
<line_number>14, 16, 17</line_number>
<line_content>from langchain.chains import RetrievalQA,
lancedb_image = Image.debian_slim().pip_install(,
'lancedb', 'langchain', 'openai', 'pandas', 'tiktoken', 'unstructured', 'tabulate'</line_content>
<context>
The importlib.metadata module provides access to the metadata of installed Python distribution packages. It can get entry points, metadata, version strings, files, and requirements for a distribution.</context>
</line>
<line>
<line_number>20, 21, 22</line_number>
<line_content>stub = Stub(,
name='example-langchain-lancedb',,
image=lancedb_image,</line_content>
<context>
how they are called. The Mock class removes the need to create multiple stubs and allows configuring return values, side effects, and tracking call arguments. Mock also supports mocking magic methods like __str__ and __len__.</context>
</line>
<line>
<line_number>23, 26, 27</line_number>
<line_content>secrets=[Secret.from_name('my-openai-secret')],,
docsearch = None,
docs_path = Path('docs.pkl')</line_content>
<context>
path. The search path can be customized by setting the PYTHONHOME or PYTHONPATH environment variables before calling Py_Initialize().</context>
</line>
<line>
<line_number>28, 31, 32</line_number>
<line_content>db_path = Path('lancedb'),
def get_document_title(document):,
m = str(document.metadata['source'])</line_content>
<context>
The pydoc module generates documentation for Python modules, functions, classes, and methods. It displays documentation derived from docstrings in multiple formats - as text on the console, served to a web browser, or saved as HTML files.</context>
</line>
<line>
<line_number>33, 34, 35</line_number>
<line_content>title = re.findall('pandas.documentation(.*).html', m),
if title[0] is not None:,
return title[0]</line_content>
<context>
The html module defines utilities for manipulating HTML in Python code. The key functions are html.escape() and html.unescape().</context>
</line>
<line>
<line_number>39, 40, 41</line_number>
<line_content>def download_docs():,
pandas_docs = requests.get(,
'https://eto-public.s3.us-west-2.amazonaws.com/datasets/pandas_docs/pandas.documentation.zip'</line_content>
<context>
The urllib.request module provides functions and classes for fetching URLs and making HTTP requests in python. Some key points:</context>
</line>
<line>
<line_number>43, 44, 46</line_number>
<line_content>with open(Path('pandas.documentation.zip'), 'wb') as f:,
f.write(pandas_docs.content),
file = zipfile.ZipFile(Path('pandas.documentation.zip'))</line_content>
<context>
The zipfile module in Python provides tools for working with ZIP archives. The module allows you to create, read, write, append, and list the contents of a ZIP file.</context>
</line>
<line>
<line_number>47, 50, 53</line_number>
<line_content>file.extractall(path=Path('pandas_docs')),
def store_docs():,
if not docs_path.exists():</line_content>
<context>
in future Python versions. The new API with files() and traversables is recommended instead.</context>
</line>
<line>
<line_number>54, 55, 57</line_number>
<line_content>for p in Path('pandas_docs/pandas.documentation').rglob('*.html'):,
if p.is_dir():,
loader = UnstructuredHTMLLoader(p)</line_content>
<context>
The html module defines utilities for manipulating HTML in Python code. The key functions are html.escape() and html.unescape().</context>
</line>
<line>
<line_number>58, 61, 62</line_number>
<line_content>raw_document = loader.load(),
m['title'] = get_document_title(raw_document[0]),
m['version'] = '2.0rc0'</line_content>
<context>
The format has changed across Python versions for compatibility reasons. There is a version argument to select the format to use. The current version is 4.</context>
</line>
<line>
<line_number>63, 64, 65</line_number>
<line_content>raw_document[0].metadata = raw_document[0].metadata | m,
raw_document[0].metadata['source'] = str(raw_document[0].metadata['source']),
docs = docs + raw_document</line_content>
<context>
- ast.parse - Parses source code into an AST
- ast.unparse - Unparses an AST back into source code
- ast.literal_eval - Safely evaluates a string with a Python literal expression
- ast.get_docstring - Gets the docstring of a node</context>
</line>
<line>
<line_number>67, 68, 70</line_number>
<line_content>with docs_path.open('wb') as fh:,
pickle.dump(docs, fh),
with docs_path.open('rb') as fh:</line_content>
<context>
The pickle interface consists of two main functions - dumps() to serialize objects to a byte stream, and loads() to deserialize the byte stream back into Python objects. There are also convenience functions like dump() and load() to work directly</context>
</line>
<line>
<line_number>71, 73, 76</line_number>
<line_content>docs = pickle.load(fh),
return docs,
def qanda_langchain(query):</line_content>
<context>
The pickle interface consists of two main functions - dumps() to serialize objects to a byte stream, and loads() to deserialize the byte stream back into Python objects. There are also convenience functions like dump() and load() to work directly</context>
</line>
<line>
<line_number>77, 78, 80</line_number>
<line_content>download_docs(),
docs = store_docs(),
text_splitter = RecursiveCharacterTextSplitter(</line_content>
<context>
The pydoc module generates documentation for Python modules, functions, classes, and methods. It displays documentation derived from docstrings in multiple formats - as text on the console, served to a web browser, or saved as HTML files.</context>
</line>
<line>
<line_number>81, 82, 84</line_number>
<line_content>chunk_size=1000,,
chunk_overlap=200,,
documents = text_splitter.split_documents(docs)</line_content>
<context>
of a chunk, you create a new Chunk instance to process the next chunk. This continues until the end of the file is reached and creating a new Chunk fails with EOFError.</context>
</line>
<line>
<line_number>85, 87, 88</line_number>
<line_content>embeddings = OpenAIEmbeddings(),
db = lancedb.connect(db_path),
table = db.create_table(</line_content>
<context>
- Creating records in tables with CreateRecord.
- Initializing a new database with init_database.
- Adding data to tables with add_data.
- Adding tables to a database with add_tables.
- Creating views on database tables with OpenView.</context>
</line>
<line>
<line_number>89, 92, 93</line_number>
<line_content>'pandas_docs',,
'vector': embeddings.embed_query('Hello World'),,
'text': 'Hello World',</line_content>
<context>
Using the embedding API an application can also extend Python by exposing functions and data from the application itself to Python code. This allows Python code to call back into the application.</context>
</line>
<line>
<line_number>94, 97, 99</line_number>
<line_content>'id': '1',,
mode='overwrite',,
docsearch = LanceDB.from_documents(documents, embeddings, connection=table)</line_content>
<context>
The doctest module checks examples in docstrings and text files, executing them and comparing the output to expected results. It contains APIs for using doctest functionality in different ways.</context>
</line>
<line>
<line_number>100, 101, 103</line_number>
<line_content>qa = RetrievalQA.from_chain_type(,
llm=OpenAI(), chain_type='stuff', retriever=docsearch.as_retriever(),
return qa.run(query)</line_content>
<context>
- urlencode() - Convert a dictionary into a urlencoded query string to be appended to a URL.
- parse_qs() and parse_qsl() - Parse query strings into Python data structures.</context>
</line>
<line>
<line_number>106, 107, 108</line_number>
<line_content>@stub.function(),
@web_endpoint(method='GET'),
def web(query: str):</line_content>
<context>
The urllib.request module provides functions and classes for fetching URLs and making HTTP requests in python. Some key points:</context>
</line>
<line>
<line_number>109, 111, 115</line_number>
<line_content>answer = qanda_langchain(query),
'answer': answer,,
@stub.function()</line_content>
<context>
- urlencode() - Convert a dictionary into a urlencoded query string to be appended to a URL.
- parse_qs() and parse_qsl() - Parse query strings into Python data structures.</context>
</line>
<line>
<line_number>116, 117, 118</line_number>
<line_content>def cli(query: str):,
answer = qanda_langchain(query),
print(answer)</line_content>
<context>
- urlencode() - Convert a dictionary into a urlencoded query string to be appended to a URL.
- parse_qs() and parse_qsl() - Parse query strings into Python data structures.
</context>
</line>
</file_context>
</file>
<file>
<file_path>docs/src/notebooks/diffusiondb/datagen.py</file_path>
<file_content>
#!/usr/bin/env python
#
# Copyright 2023 LanceDB Developers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Dataset hf://poloclub/diffusiondb
"""
import io
from argparse import ArgumentParser
from multiprocessing import Pool
import lance
import lancedb
import pyarrow as pa
from datasets import load_dataset
from PIL import Image
from transformers import CLIPModel, CLIPProcessor, CLIPTokenizerFast
# Model id shared by the tokenizer, model and processor below.
MODEL_ID = "openai/clip-vit-base-patch32"
# Device the model and the pixel tensors are moved to; hard-coded to CUDA.
device = "cuda"

# Load all three components from MODEL_ID so they cannot drift apart
# (the model and processor previously repeated the literal string).
tokenizer = CLIPTokenizerFast.from_pretrained(MODEL_ID)
model = CLIPModel.from_pretrained(MODEL_ID).to(device)
processor = CLIPProcessor.from_pretrained(MODEL_ID)

# Arrow schema of the table built in datagen(): the dataset's metadata columns,
# a fixed-size 512-float "vector" column and the raw "image" bytes.
schema = pa.schema(
    [
        pa.field("prompt", pa.string()),
        pa.field("seed", pa.uint32()),
        pa.field("step", pa.uint16()),
        pa.field("cfg", pa.float32()),
        pa.field("sampler", pa.string()),
        pa.field("width", pa.uint16()),
        pa.field("height", pa.uint16()),
        pa.field("timestamp", pa.timestamp("s")),
        pa.field("image_nsfw", pa.float32()),
        pa.field("prompt_nsfw", pa.float32()),
        pa.field("vector", pa.list_(pa.float32(), 512)),
        pa.field("image", pa.binary()),
    ]
)
def pil_to_bytes(img) -> bytes:
    """Serialize an image to PNG and return the encoded bytes.

    Args:
        img: Object exposing ``save(fileobj, format=...)`` (a PIL image in
            this script).

    Returns:
        The PNG-encoded image as a single ``bytes`` object. The previous
        ``list[bytes]`` annotation did not match what is returned.
    """
    buf = io.BytesIO()
    img.save(buf, format="PNG")
    return buf.getvalue()
def generate_clip_embeddings(batch):
    """Add CLIP image embeddings and PNG bytes to a batch of rows.

    Args:
        batch: Mutable mapping with an ``"image"`` entry holding the images of
            the batch (presumably PIL images supplied by ``dataset.map`` —
            confirm against the caller).

    Returns:
        The same ``batch`` object, with ``"vector"`` (embeddings as nested
        lists) and ``"image_bytes"`` (one PNG blob per image) added. The
        former ``pa.RecordBatch`` return annotation was dropped because the
        function returns its input mapping, not an Arrow batch.
    """
    inputs = processor(text=None, images=batch["image"], return_tensors="pt")
    pixel_values = inputs["pixel_values"].to(device)
    img_emb = model.get_image_features(pixel_values)
    # Move to CPU before converting to plain Python lists.
    batch["vector"] = img_emb.cpu().tolist()
    # PNG encoding is fanned out across worker processes.
    with Pool() as p:
        batch["image_bytes"] = p.map(pil_to_bytes, batch["image"])
    return batch
def datagen(args):
    """Build an Arrow table of DiffusionDB rows with CLIP image embeddings.

    Args:
        args: Parsed CLI namespace; only ``args.subset`` is read here.

    Returns:
        A ``pa.Table`` following the module-level ``schema``.
    """
    dataset = load_dataset("poloclub/diffusiondb", args.subset)
    embedded = dataset.map(
        generate_clip_embeddings,
        batched=True,
        batch_size=256,
        remove_columns=["image"],
    )

    rows = []
    for row in embedded["train"]:
        # Move the PNG bytes back under the "image" key expected by the schema.
        row["image"] = row["image_bytes"]
        del row["image_bytes"]
        rows.append(row)
    return pa.Table.from_pylist(rows, schema=schema)
def main():
    """Parse the command line, generate the table and write it as a Lance dataset."""
    cli_parser = ArgumentParser()
    cli_parser.add_argument(
        "-o",
        "--output",
        required=True,
        metavar="DIR",
        help="Output lance directory",
    )
    cli_parser.add_argument(
        "-s",
        "--subset",
        default="2m_first_10k",
        choices=["2m_all", "2m_first_10k", "2m_first_100k"],
        help="subset of the hg dataset",
    )
    options = cli_parser.parse_args()

    table = datagen(options)
    lance.write_dataset(table, options.output)


if __name__ == "__main__":
    main()
</file_content>
<file_context>
<line>
<line_number>15, 19, 20</line_number>
<line_content>'''Dataset hf://poloclub/diffusiondb,
from argparse import ArgumentParser,
from multiprocessing import Pool</line_content>
<context>
parsing arguments. Additional information covers building values in Python from C values.</context>
</line>
<line>
<line_number>22, 23, 24</line_number>
<line_content>import lance,
import lancedb,
import pyarrow as pa</line_content>
<context>
PyImport_ImportModuleEx() imports a module by name, with additional globals, locals, and fromlist arguments similar to Python's __import__() function. It returns the imported module or NULL if there was an error.</context>
</line>
<line>
<line_number>25, 26, 27</line_number>
<line_content>from datasets import load_dataset,
from PIL import Image,
from transformers import CLIPModel, CLIPProcessor, CLIPTokenizerFast</line_content>
<context>
The main functions are tomllib.load() which parses a TOML file, and tomllib.loads() which parses a TOML string. They take the TOML source as the first argument, and return a dict of the parsed data. An optional parse_float argument can be passed to</context>
</line>
<line>
<line_number>29, 31, 33</line_number>
<line_content>MODEL_ID = 'openai/clip-vit-base-patch32',
device = 'cuda',
tokenizer = CLIPTokenizerFast.from_pretrained(MODEL_ID)</line_content>
<context>
The encode_* functions raise TypeError if passed a multipart message instead of encoding the subparts individually. They extract the payload, encode it, and reset the payload to the encoded value.</context>
</line>
<line>
<line_number>34, 35, 37</line_number>
<line_content>model = CLIPModel.from_pretrained('openai/clip-vit-base-patch32').to(device),
processor = CLIPProcessor.from_pretrained('openai/clip-vit-base-patch32'),
schema = pa.schema(</line_content>
<context>
PySlice_New creates a new slice object given start, stop, and step values (any can be None). PySlice_GetIndices and PySlice_GetIndicesEx extract the start, stop, and step values from a slice assuming a sequence of a given length, clipping out of</context>
</line>
<line>
<line_number>39, 40, 41</line_number>
<line_content>pa.field('prompt', pa.string()),,
pa.field('seed', pa.uint32()),,
pa.field('step', pa.uint16()),</line_content>
<context>
__name__ attribute. PyModule_GetState returns the module state.</context>
</line>
<line>
<line_number>42, 43, 44</line_number>
<line_content>pa.field('cfg', pa.float32()),,
pa.field('sampler', pa.string()),,
pa.field('width', pa.uint16()),</line_content>
<context>
- Get a string representation of an object, like with PyObject_Str and PyObject_Repr. These implement str() and repr().
- Get the length or size of an object, with PyObject_Length and PyObject_Size. These implement len().</context>
</line>
<line>
<line_number>45, 46, 47</line_number>
<line_content>pa.field('height', pa.uint16()),,
pa.field('timestamp', pa.timestamp('s')),,
pa.field('image_nsfw', pa.float32()),</line_content>
<context>
PyFloat_GetInfo returns a structseq with info on float precision, max, and min. PyFloat_GetMax returns the max float DBL_MAX. PyFloat_GetMin returns the min float DBL_MIN.</context>
</line>
<line>
<line_number>48, 49, 50</line_number>
<line_content>pa.field('prompt_nsfw', pa.float32()),,
pa.field('vector', pa.list_(pa.float32(), 512)),,
pa.field('image', pa.binary()),</line_content>
<context>
the __name__ and __qualname__ attributes from the passed in name and qualname arguments. The PyCoro_New function steals a reference to the frame object passed in.</context>
</line>
<line>
<line_number>55, 56, 57</line_number>
<line_content>def pil_to_bytes(img) -> list[bytes]:,
buf = io.BytesIO(),
img.save(buf, format='PNG')</line_content>
<context>
You can concatenate bytes objects with PyBytes_Concat, which creates a new bytes object with the contents of the old and new bytes objects.</context>
</line>
<line>
<line_number>58, 61, 62</line_number>
<line_content>return buf.getvalue(),
def generate_clip_embeddings(batch) -> pa.RecordBatch:,
image = processor(text=None, images=batch['image'], return_tensors='pt')[</line_content>
<context>
- format_list() - Takes a list from extract_tb() and formats the frames for printing.
- format_exception() - Formats exception and traceback info into a list of strings for printing.</context>
</line>
<line>
<line_number>63, 64, 65</line_number>
<line_content>'pixel_values',
].to(device),
img_emb = model.get_image_features(image)</line_content>
<context>
and PyCode_GetFreevars return the names of local variables, cell variables, and free variables respectively.</context>
</line>
<line>
<line_number>66, 68, 69</line_number>
<line_content>batch['vector'] = img_emb.cpu().tolist(),
with Pool() as p:,
batch['image_bytes'] = p.map(pil_to_bytes, batch['image'])</line_content>
<context>
You can concatenate bytes objects with PyBytes_Concat, which creates a new bytes object with the contents of the old and new bytes objects.</context>
</line>
<line>
<line_number>70, 73, 74</line_number>
<line_content>return batch,
def datagen(args):,
'''Generate DiffusionDB dataset, and use CLIP model to generate image embeddings.'''</line_content>
<context>
The command line interface allows creating an archive from a directory containing Python code. It has options to specify the output file, Python interpreter to use in the shebang line, main function to call, whether to compress files, and to display</context>
</line>
<line>
<line_number>75, 77, 78</line_number>
<line_content>dataset = load_dataset('poloclub/diffusiondb', args.subset),
for b in dataset.map(,
generate_clip_embeddings, batched=True, batch_size=256, remove_columns=['image']</line_content>
<context>
For example, rgb_to_hsv can convert an (R, G, B) tuple to (H, S, V) and hsv_to_rgb does the reverse conversion. The colorsys module enables flexible color space conversions in Python.</context>
</line>
<line>
<line_number>79, 80, 81</line_number>
<line_content>)['train']:,
b['image'] = b['image_bytes'],
del b['image_bytes']</line_content>
<context>
Python encodes them into bytes using an encoding like UTF-8.</context>
</line>
<line>
<line_number>82, 83, 84</line_number>
<line_content>data.append(b),
tbl = pa.Table.from_pylist(data, schema=schema),
return tbl</line_content>
<context>
PyList_Insert inserts an item at a given index. PyList_Append adds an item to the end of a list.</context>
</line>
<line>
<line_number>87, 88, 89</line_number>
<line_content>def main():,
parser = ArgumentParser(),
parser.add_argument(</line_content>
<context>
The add_argument() method is used to register arguments with the parser. It allows specifying options like the type, default value, help text, and more. The parse_args() method then does the parsing and conversion to create the namespace object.</context>
</line>
<line>
<line_number>90, 92, 94</line_number>
<line_content>'-o', '--output', metavar='DIR', help='Output lance directory', required=True,
parser.add_argument(,
'--subset',</line_content>
<context>
- report(), report_partial_closure(), report_full_closure() - Print reports of the directory comparison.
- left, right - The left and right directories.
- common_dirs, common_files - Common subdirectories and files.</context>
</line>
<line>
<line_number>95, 96, 97</line_number>
<line_content>choices=['2m_all', '2m_first_10k', '2m_first_100k'],,
default='2m_first_10k',,
help='subset of the hg dataset',</line_content>
<context>
After parsing, opts contains a list of (option, value) tuples and args contains the remaining non-option arguments. These can then be processed in the script.</context>
</line>
<line>
<line_number>100, 102, 103</line_number>
<line_content>args = parser.parse_args(),
batches = datagen(args),
lance.write_dataset(batches, args.output)</line_content>
<context>
Overall, the argparse module provides powerful command-line parsing with features like positional arguments, optional arguments, help text generation, and argument conversion. The examples show many typical use cases for the module.
</context>
</line>
</file_context>
</file>
<file>
<file_path>docs/test/md_testing.py</file_path>
<file_content>
import glob
from typing import Iterator
from pathlib import Path
# Markdown files skipped by create_code_files(); entries are compared verbatim
# against the paths returned by glob.glob(glob_string).
excluded_files = [
"../src/fts.md",
"../src/embedding.md",
"../src/examples/serverless_lancedb_with_s3_and_lambda.md",
"../src/examples/serverless_qa_bot_with_modal_and_langchain.md",
"../src/examples/youtube_transcript_bot_with_nodejs.md"
]
# Language tags recognised by yield_lines(); each is matched as a prefix of the
# text that follows the opening marker.
languages = ["py", "javascript"]
# Recursive glob pattern for the markdown sources, relative to the working directory.
glob_string = "../src/**/*.md"
def yield_lines(lines: Iterator[str], prefix: str, suffix: str, languages: list):
    """Yield ``(line, language)`` pairs for lines inside marked code blocks.

    A block for a language opens on a line whose stripped text starts with
    ``prefix + language`` and closes on a line whose stripped text starts with
    ``suffix``. Opening lines are not yielded; a closing line is yielded as
    ``("\\n", language)``; every line in between is yielded unchanged.

    Args:
        lines: Source lines to scan.
        prefix: Marker that opens a block when followed by a language tag.
        suffix: Marker that closes an open block.
        languages: Language tags to track, checked in this order for every line.
    """
    inside = dict.fromkeys(languages, False)
    for raw in lines:
        stripped = raw.strip()
        for lang in languages:
            if stripped.startswith(prefix + lang):
                inside[lang] = True
                continue
            if not inside[lang]:
                continue
            if stripped.startswith(suffix):
                inside[lang] = False
                yield ("\n", lang)
            else:
                yield (raw, lang)
def create_code_files(prefix: str, suffix: str, file_ending: str = ""):
    """Extract code snippets from the markdown docs into per-document source files.

    For every file matched by ``glob_string`` that is not in
    ``excluded_files``, the lines yielded by ``yield_lines`` are split by
    language and written to ``python/<stem>/<stem><file_ending>.py`` and
    ``node/<stem>/<stem><file_ending>.js``. A file is only written when the
    document has at least one line for that language. JavaScript output is
    wrapped in an async IIFE.

    Args:
        prefix: Marker that, followed by a language tag, opens a snippet.
        suffix: Marker that closes a snippet.
        file_ending: Text inserted between the document stem and the extension.
    """
    for file in glob.glob(glob_string, recursive=True):
        if file in excluded_files:
            continue

        with open(file, "r") as f:
            lines = list(yield_lines(iter(f), prefix, suffix, languages))
        python_lines = [text for text, language in lines if language == "py"]
        node_lines = [text for text, language in lines if language == "javascript"]

        # Path.stem removes only the trailing ".md". The previous
        # name.strip(".md") stripped any leading/trailing ".", "m" or "d"
        # characters, mangling names such as "modal.md" into "odal".
        stem = Path(file).stem

        if python_lines:
            python_out_path = Path("python") / stem / (stem + file_ending + ".py")
            python_out_path.parent.mkdir(exist_ok=True, parents=True)
            with open(python_out_path, "w") as python_out:
                python_out.writelines(python_lines)

        if node_lines:
            node_out_path = Path("node") / stem / (stem + file_ending + ".js")
            node_out_path.parent.mkdir(exist_ok=True, parents=True)
            with open(node_out_path, "w") as node_out:
                node_out.write("(async () => {\n")
                node_out.writelines(node_lines)
                node_out.write("})();")
# Setup doc code: snippets opened by "<!--" + language and closed by "-->",
# written to files whose names end in "-setup".
create_code_files("<!--", "-->", "-setup")
# Actual doc code: snippets opened by "```" + language and closed by "```".
create_code_files("```", "```")
</file_content>
<file_context>
<line>
<line_number>0, 1, 2</line_number>
<line_content>import glob,
from typing import Iterator,
from pathlib import Path</line_content>
<context>
with '[]'. glob.glob() returns a list of matching pathnames, which can be absolute or relative paths. The glob.iglob() function returns an iterator instead of a list. The glob module uses os.scandir() and fnmatch.fnmatch() internally. Files starting</context>
</line>
<line>
<line_number>4, 5, 6</line_number>
<line_content>excluded_files = [,
'../src/fts.md',,
'../src/embedding.md',</line_content>
<context>
- include *.txt to add all .txt files
- recursive-include examples *.py to add all .py files recursively under examples/
- prune examples/tmp to exclude the examples/tmp directory</context>
</line>
<line>
<line_number>7, 8, 9</line_number>
<line_content>'../src/examples/serverless_lancedb_with_s3_and_lambda.md',,
'../src/examples/serverless_qa_bot_with_modal_and_langchain.md',,
'../src/examples/youtube_transcript_bot_with_nodejs.md'</line_content>
<context>
with the ext_modules option using Extension objects. Scripts are listed under the scripts option. Data files and extra files can also be included.</context>
</line>
<line>
<line_number>11, 12, 14</line_number>
<line_content>languages = ['py', 'javascript'],
glob_string = '../src/**/*.md',
def yield_lines(lines: Iterator[str], prefix: str, suffix: str, languages: list):</line_content>
<context>
Some examples of usage:
- Append a tr command to uppercase:
t.append('tr a-z A-Z', '--')
- Open a file-like object for writing text through the pipeline:
f = t.open('outfile', 'w')</context>
</line>
<line>
<line_number>15, 16, 17</line_number>
<line_content>current_language = {language: False for language in languages},
for line in lines:,
for language in languages:</line_content>
<context>
Python does not use braces for blocks in "if", "while", "for", etc because it was influenced by the ABC language which found it improved readability. The colon introduces the block and eliminates ambiguity about scope. Editors can also use the colon</context>
</line>
<line>
<line_number>18, 19, 20</line_number>
<line_content>if line.strip().startswith(prefix + language):,
current_language[language] = True,
elif current_language[language] and line.strip().startswith(suffix):</line_content>
<context>
returns true if Python is currently initialized.</context>
</line>
<line>
<line_number>21, 22, 23</line_number>
<line_content>current_language[language] = False,
yield ('\n', language),
elif current_language[language]:</line_content>
<context>
The language is named after Monty Python's Flying Circus. The tutorial invites you to play with the Python interpreter while reading to learn the language through examples. It covers basic language elements like expressions, data types, functions</context>
</line>
<line>
<line_number>24, 26, 27</line_number>
<line_content>yield (line, language),
def create_code_files(prefix: str, suffix: str, file_ending: str = ''):,
for file in filter(lambda file: file not in excluded_files, glob.glob(glob_string, recursive=True)):</line_content>
<context>
txt_files = fnmatch.filter(names, '*.txt')
So in summary, fnmatch provides simple Unix style filename matching functionality in Python. It can be useful for filtering lists of files and doing glob style matching.</context>
</line>
<line>
<line_number>28, 29, 30</line_number>
<line_content>with open(file, 'r') as f:,
lines = list(yield_lines(iter(f), prefix, suffix, languages)),
python_lines = [line[0] for line in lines if line[1] == 'py']</line_content>
<context>
A Python program is divided into logical lines terminated by NEWLINE tokens. Logical lines can span multiple physical lines using explicit line joining with backslashes or implicit line joining by enclosing expressions in</context>
</line>
<line>
<line_number>31, 33, 34</line_number>
<line_content>node_lines = [line[0] for line in lines if line[1] == 'javascript'],
if len(python_lines) > 0:,
python_out_path = Path('python') / Path(file).name.strip('.md') / (Path(file).name.strip('.md') + file_ending + '.py')</line_content>
<context>
A Python program is divided into logical lines terminated by NEWLINE tokens. Logical lines can span multiple physical lines using explicit line joining with backslashes or implicit line joining by enclosing expressions in</context>
</line>
<line>
<line_number>35, 36, 37</line_number>
<line_content>python_out_path.parent.mkdir(exist_ok=True, parents=True),
with open(python_out_path, 'w') as python_out:,
python_out.writelines(python_lines)</line_content>
<context>
The PYTHONPATH environment variable can add more directories to the search path.</context>
</line>
<line>
<line_number>39, 40, 41</line_number>
<line_content>if len(node_lines) > 0:,
node_out_path = Path('node') / Path(file).name.strip('.md') / (Path(file).name.strip('.md') + file_ending + '.js'),
node_out_path.parent.mkdir(exist_ok=True, parents=True)</line_content>
<context>
Methods like Path.exists(), Path.is_dir(), Path.is_file(), Path.open() allow querying properties of a filesystem path and interacting with the filesystem. Path.rmdir(), Path.unlink(), Path.rename(), and Path.replace() perform system calls to remove,</context>
</line>
<line>
<line_number>42, 43, 44</line_number>
<line_content>with open(node_out_path, 'w') as node_out:,
node_out.write('(async () => {\n'),
node_out.writelines(node_lines)</line_content>
<context>
The asyncio streams module provides high-level async/await-ready primitives to work with network connections and streams. The key functions are asyncio.open_connection(), asyncio.start_server(), asyncio.open_unix_connection(), and</context>
</line>
<line>
<line_number>45, 48, 51</line_number>
<line_content>node_out.write('})();'),
create_code_files('<!--', '-->', '-setup'),
create_code_files('```', '```')</line_content>
<context>
- Can output code directly into the file, or into separate buffers/files for cleaner code organization.
</context>
</line>
</file_context>
</file>
<file>
<file_path>python/lancedb/__init__.py</file_path>
<file_content>
# Copyright 2023 LanceDB Developers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .db import URI, LanceDBConnection
def connect(uri: URI) -> LanceDBConnection:
    """Open a connection to the LanceDB database located at ``uri``.

    Parameters
    ----------
    uri: str or Path
        Location of the database.

    Examples
    --------

    A local database is addressed by a directory path:

    >>> import lancedb
    >>> db = lancedb.connect("~/.lancedb")

    A database on object storage is addressed by a URI prefix:

    >>> db = lancedb.connect("s3://my-bucket/lancedb")

    Returns
    -------
    conn : LanceDBConnection
        A connection to a LanceDB database.
    """
    connection = LanceDBConnection(uri)
    return connection
</file_content>
<file_context>
<line>
<line_number>13, 16, 17</line_number>
<line_content>from .db import URI, LanceDBConnection,
def connect(uri: URI) -> LanceDBConnection:,
'''Connect to a LanceDB instance at the given URI</line_content>
<context>
The sqlite3 module provides a DB-API 2.0 compliant interface for working with SQLite databases in Python. It allows executing SQL statements and fetching results. Key components include the Connection, Cursor, and Row classes.</context>
</line>
<line>
<line_number>19, 20, 21</line_number>
<line_content>Parameters,
----------,
uri: str or Path</line_content>
<context>
- urlunparse() - Puts a parsed URL back together into a complete URL string. This is the inverse of urlparse().
- urlsplit() - Similar to urlparse() but doesn't split params and query.</context>
</line>
<line>
<line_number>22, 27, 29</line_number>
<line_content>The uri of the database.,
For a local directory, provide a path for the database:,
>>> import lancedb</line_content>
<context>
The sqlite3 module provides a DB-API 2.0 compliant interface for working with SQLite databases in Python. It allows executing SQL statements and fetching results. Key components include the Connection, Cursor, and Row classes.</context>
</line>
<line>
<line_number>30, 32, 34</line_number>
<line_content>>>> db = lancedb.connect('~/.lancedb'),
For object storage, use a URI prefix:,
>>> db = lancedb.connect('s3://my-bucket/lancedb')</line_content>
<context>
The sqlite3 module provides a DB-API 2.0 compliant interface for working with SQLite databases in Python. It allows executing SQL statements and fetching results. Key components include the Connection, Cursor, and Row classes.</context>
</line>
<line>
<line_number>38, 39, 41</line_number>
<line_content>conn : LanceDBConnection,
A connection to a LanceDB database.,
return LanceDBConnection(uri)</line_content>
<context>
A Connection represents a db connection. It can create Cursor objects to execute SQL statements. Connection provides methods like commit(), rollback(), close() for transaction control. The isolation_level attribute controls implicit transaction
</context>
</line>
</file_context>
</file>
<file>
<file_path>python/lancedb/common.py</file_path>
<file_content>
# Copyright 2023 LanceDB Developers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pathlib import Path
from typing import List, Union
import numpy as np
import pandas as pd
import pyarrow as pa
# Container types accepted wherever a single vector value is expected
# (presumably an embedding / query vector -- confirm against callers).
VEC = Union[list, np.ndarray, pa.Array, pa.ChunkedArray]
# A URI expressed either as a plain string or as a pathlib.Path.
URI = Union[str, Path]
# TODO support generator
# Accepted input data shapes: a list of dicts, a single dict, or a pandas DataFrame.
DATA = Union[List[dict], dict, pd.DataFrame]
# NOTE(review): looks like the default name of the column that stores vectors --
# confirm at the usage sites.
VECTOR_COLUMN_NAME = "vector"
</file_content>
<file_context>
<line>
<line_number>12, 13, 15</line_number>
<line_content>from pathlib import Path,
from typing import List, Union,
import numpy as np</line_content>
<context>
The Python standard library contains many built-in modules and functions that provide common functionality for Python programmers. It includes modules for file I/O, system access, data types like lists and dictionaries, text processing, networking,</context>
</line>
<line>
<line_number>16, 17, 19</line_number>
<line_content>import pandas as pd,
import pyarrow as pa,
VEC = Union[list, np.ndarray, pa.Array, pa.ChunkedArray]</line_content>
<context>
The python list data type has a built-in sort() method that sorts the list in-place. There is also a sorted() built-in function that builds a new sorted list from an iterable.</context>
</line>
<line>
<line_number>20, 23, 24</line_number>
<line_content>URI = Union[str, Path],
DATA = Union[List[dict], dict, pd.DataFrame],
VECTOR_COLUMN_NAME = 'vector'</line_content>
<context>
- urlencode() - Convert a dictionary into a urlencoded query string to be appended to a URL.
- parse_qs() and parse_qsl() - Parse query strings into Python data structures.
</context>
</line>
</file_context>
</file>
<file>
<file_path>python/lancedb/conftest.py</file_path>
<file_content>
import builtins
import os
import pytest
# import lancedb so we don't have to in every example
import lancedb
@pytest.fixture(autouse=True)
def doctest_setup(monkeypatch, tmpdir):
# disable color for doctests so we don't have to include
# escape codes in docstrings
monkeypatch.setitem(os.environ, "NO_COLOR", "1")