-
Notifications
You must be signed in to change notification settings - Fork 1
/
MergeTrainFiles.java
110 lines (93 loc) · 2.87 KB
/
MergeTrainFiles.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import java.io.*;
import java.util.*;
import java.util.zip.*;
/**
 * Merges the same-named training file found in several directories into a
 * single gzip-compressed file.
 *
 * The list of inputs is read from the file named by
 * {@code Constants.TRAINDIRFEATURES}. Every non-blank line of that file holds
 * two tab-separated fields: a training directory and a feature file. The
 * number of lines in each feature file determines how far the feature indices
 * of all later directories are shifted, so that indices coming from different
 * directories do not collide in the merged output.
 */
public class MergeTrainFiles
{
    /**
     * Entry point.
     *
     * @param args {@code args[0]} is the name of the training file that is
     *             looked up in every training directory and also used as the
     *             output file name; {@code args[1]} is the output directory,
     *             which is created if it does not exist.
     * @throws IOException if a file cannot be read or written, or if one of
     *                     the training files ends before the first one does
     * @throws IllegalArgumentException if arguments are missing, the output
     *                     directory cannot be created, or the feature list is
     *                     empty or malformed
     */
    public static void main(String[] args) throws IOException
    {
        if (args.length < 2)
        {
            throw new IllegalArgumentException("Usage: MergeTrainFiles <trainfile> <outputdir>");
        }
        String sztrainfile = args[0];
        String szfeaturelistdir = Constants.TRAINDIRFEATURES;
        String szoutputdir = args[1];

        File dir = new File(szoutputdir);
        if (!dir.exists() && !dir.mkdirs())
        {
            throw new IllegalArgumentException(szoutputdir+" does not exist and could not be created!");
        }

        // Read the (training directory, feature file) pairs in a single pass.
        List<String> traindirs = new ArrayList<String>();
        List<String> featurefiles = new ArrayList<String>();
        BufferedReader brfeaturefile = Util.getBufferedReader(szfeaturelistdir);
        try
        {
            String szLine;
            while ((szLine = brfeaturefile.readLine()) != null)
            {
                if (szLine.trim().equals(""))
                {
                    continue;
                }
                StringTokenizer st = new StringTokenizer(szLine, "\t");
                if (st.countTokens() < 2)
                {
                    throw new IllegalArgumentException("Expected <traindir>\\t<featurefile> in "
                            + szfeaturelistdir + " but found: " + szLine);
                }
                traindirs.add(st.nextToken());
                featurefiles.add(st.nextToken());
            }
        }
        finally
        {
            brfeaturefile.close();
        }

        if (traindirs.isEmpty())
        {
            throw new IllegalArgumentException(szfeaturelistdir + " does not list any training directory");
        }

        int[] offsetA = computeOffsets(featurefiles);

        String[] trainpaths = new String[traindirs.size()];
        for (int ndir = 0; ndir < trainpaths.length; ndir++)
        {
            trainpaths[ndir] = traindirs.get(ndir) + "/" + sztrainfile;
        }

        BufferedReader[] brA = new BufferedReader[trainpaths.length];
        try
        {
            for (int ndir = 0; ndir < brA.length; ndir++)
            {
                brA[ndir] = Util.getBufferedReader(trainpaths[ndir]);
            }

            GZIPOutputStream pw = new GZIPOutputStream(new FileOutputStream(szoutputdir + "/" + sztrainfile));
            try
            {
                String szLine;
                int nline = 0;
                // The first file drives the merge: its lines are copied as-is
                // (label and features), the other files only contribute features.
                while ((szLine = brA[0].readLine()) != null)
                {
                    nline++;
                    StringBuilder sb = new StringBuilder(szLine);
                    for (int nfile = 1; nfile < brA.length; nfile++)
                    {
                        String szother = brA[nfile].readLine();
                        if (szother == null)
                        {
                            throw new IOException(trainpaths[nfile] + " ends before line " + nline
                                    + " of " + trainpaths[0]);
                        }
                        appendShiftedFeatures(sb, szother, offsetA[nfile], trainpaths[nfile], nline);
                    }
                    sb.append('\n');
                    // Platform default charset is kept on purpose: the charset used by
                    // Util.getBufferedReader is not visible here, so it is not overridden.
                    byte[] btformat = sb.toString().getBytes();
                    pw.write(btformat, 0, btformat.length);
                }
                pw.finish();
            }
            finally
            {
                pw.close();
            }
        }
        finally
        {
            closeAll(brA);
        }
    }

    /**
     * Computes, for each directory, the amount added to its feature indices:
     * entry 0 is 0 and entry i+1 is entry i plus the number of lines of
     * feature file i. The last feature file is never read because nothing
     * follows it.
     */
    private static int[] computeOffsets(List<String> featurefiles) throws IOException
    {
        int[] offsetA = new int[featurefiles.size()];
        for (int ndir = 0; ndir < offsetA.length - 1; ndir++)
        {
            offsetA[ndir + 1] = offsetA[ndir] + countLines(featurefiles.get(ndir));
        }
        return offsetA;
    }

    /** Returns the number of lines (blank ones included) in the given file. */
    private static int countLines(String szfile) throws IOException
    {
        BufferedReader brfeature = Util.getBufferedReader(szfile);
        try
        {
            int nlinecount = 0;
            while (brfeature.readLine() != null)
            {
                nlinecount++;
            }
            return nlinecount;
        }
        finally
        {
            brfeature.close();
        }
    }

    /**
     * Appends the features of one space-separated line to {@code sb}. The
     * first token (the label) is skipped; for every other token the integer
     * before the ':' is shifted by {@code noffset}. The text after the ':' is
     * not read: every appended feature is written with value 1.
     */
    private static void appendShiftedFeatures(StringBuilder sb, String szLine, int noffset,
                                              String szpath, int nline) throws IOException
    {
        StringTokenizer st = new StringTokenizer(szLine, " ");
        if (!st.hasMoreTokens())
        {
            throw new IOException("Line " + nline + " of " + szpath + " is blank (no label)");
        }
        st.nextToken(); // skip label
        while (st.hasMoreTokens())
        {
            StringTokenizer stcolon = new StringTokenizer(st.nextToken(), ":");
            int nindex = Integer.parseInt(stcolon.nextToken()) + noffset;
            sb.append(' ').append(nindex).append(":1");
        }
    }

    /**
     * Closes every reader that was opened. All readers are attempted even if
     * one close fails; the first failure is rethrown afterwards.
     */
    private static void closeAll(BufferedReader[] brA) throws IOException
    {
        IOException first = null;
        for (int ndir = 0; ndir < brA.length; ndir++)
        {
            if (brA[ndir] == null)
            {
                continue;
            }
            try
            {
                brA[ndir].close();
            }
            catch (IOException e)
            {
                if (first == null)
                {
                    first = e;
                }
            }
        }
        if (first != null)
        {
            throw first;
        }
    }
}