-
Notifications
You must be signed in to change notification settings - Fork 0
/
read_write.py
50 lines (45 loc) · 2.16 KB
/
read_write.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
from collections import deque
import os,generate_file,time
def split_records(memory):
    """Parse newline-separated text into a deque of ``(key, value)`` tuples.

    Each non-blank line is split on whitespace. The first field is converted
    to ``int`` and paired with the second field, so that sorting the result
    orders records by the integer key first. Fields after the second one are
    ignored.

    Args:
        memory: Text holding one record per line, separated by ``'\\n'``.

    Returns:
        A ``collections.deque`` of ``(int, str)`` tuples in input order.

    Raises:
        ValueError: If the first field of a line is not an integer.
        IndexError: If a non-blank line has fewer than two fields.
    """
    records = deque()
    for line in memory.split('\n'):
        fields = line.split()
        # Skip empty AND whitespace-only lines: the piece after the final
        # '\n' is empty, and a stray '\r' or run of spaces yields no fields,
        # which would otherwise fail on fields[0].
        if not fields:
            continue
        # Tuples keep the int key first so a plain sort orders by key.
        records.append((int(fields[0]), fields[1]))
    return records
def check_target_directory():
    """Make sure the output directory exists and contains no entries.

    The directory is ``generate_file.FILE_PATH`` resolved against the current
    working directory:

    * if a regular file occupies that path, it is deleted and a directory of
      the same name is created in its place;
    * if nothing exists at that path, the directory is created;
    * every entry found in the directory is then removed with ``os.remove``
      so the sub-files can be regenerated from scratch.

    Raises:
        OSError: If an entry cannot be removed. ``os.remove`` does not delete
            directories, so a subdirectory inside the target raises here.
    """
    target = os.path.join(os.path.abspath('.'), generate_file.FILE_PATH)

    # A plain file with the directory's name is in the way: drop it first.
    if os.path.isfile(target):
        os.remove(target)

    # Covers both "never existed" and "was a file that we just removed".
    if not os.path.exists(target):
        os.makedirs(target)

    for entry in os.listdir(target):
        os.remove(os.path.join(target, entry))
if __name__ == '__main__':
    # Split 'file.dat' into sorted run files: read it in chunks of
    # generate_file.MEMORY_SIZE, sort the records of each chunk by key, and
    # write them to sub_file_<n>.dat inside generate_file.FILE_PATH.
    check_target_directory()
    print('开始处理文件(每次读取50M文件,并进行归并排序)')
    start = time.time()
    output_dir = os.path.abspath(generate_file.FILE_PATH)
    with open('file.dat', 'rb') as f:
        count = 1
        # Bytes of a trailing incomplete line, carried over to the next chunk.
        # A fixed-size read can stop in the middle of a record, or even in the
        # middle of a multi-byte UTF-8 character, so only complete lines are
        # decoded and parsed in each round.
        pending = b''
        while True:
            chunk = f.read(generate_file.MEMORY_SIZE)
            if chunk:
                # Everything up to and including the last '\n' is complete;
                # the remainder waits for the next read. If there is no '\n'
                # at all, rpartition leaves the whole buffer in `pending`.
                complete, newline, pending = (pending + chunk).rpartition(b'\n')
                complete += newline
            else:
                # End of file: whatever is left is the final record, which
                # may have no trailing newline.
                complete, pending = pending, b''

            records = sorted(split_records(complete.decode('utf-8')))
            if records:  # do not create empty sub-files for blank input
                lines = ["%s %s\n" % (x[0], x[1]) for x in records]
                file_name = os.path.join(output_dir, 'sub_file_' + str(count) + '.dat')
                # Binary mode keeps '\n' line endings untranslated.
                with open(file_name, 'wb') as w:
                    w.write(''.join(lines).encode('utf-8'))
                count += 1

            # Stop only at real end of file, not on a chunk that happens to
            # be blank.
            if not chunk:
                break
    print('所有文件已生成,总耗时:%fs' % (time.time() - start))