diff --git a/commands.sh b/commands.sh index ed836fb..f7d369b 100644 --- a/commands.sh +++ b/commands.sh @@ -1,41 +1,42 @@ #!/bin/sh -java -Xmx80g -jar graphlod-0.1.jar --excludedNamespaces \ +java -Xmx80g -jar graphlod-0.1.jar --name drugbank --excludedNamespaces \ "http://www4.wiwiss.fu-berlin.de/drugbank/resource/drugtype/" \ "http://www4.wiwiss.fu-berlin.de/drugbank/resource/references/" \ --skipChromatic \ --namespace "http://www4.wiwiss.fu-berlin.de/drugbank/" \ /data/graphlod/drugbank/drugbank.nt | tee drugbank.txt -java -Xmx80g -jar graphlod-0.1.jar --skipChromatic \ +java -Xmx80g -jar graphlod-0.1.jar --name dailymed --skipChromatic \ --namespace "http://www4.wiwiss.fu-berlin.de/dailymed/" \ /data/graphlod/dailymed/dailymed_dump.nt | tee dailymed_dump.txt -java -Xmx80g -jar graphlod-0.1.jar --skipChromatic --excludedNamespaces \ +java -Xmx80g -jar graphlod-0.1.jar --name diseasome --skipChromatic --excludedNamespaces \ "http://www4.wiwiss.fu-berlin.de/diseasome/resource/diseaseClass/" \ --namespace "http://www4.wiwiss.fu-berlin.de/diseasome/" \ /data/graphlod/diseasome/diseasome.nt | tee diseasome.txt -java -Xmx80g -jar graphlod-0.1.jar --skipChromatic \ +java -Xmx80g -jar graphlod-0.1.jar --name dbpedia_person --skipChromatic \ --namespace "http://dbpedia.org/resource" \ /data/graphlod/dbpedia/persondata_en.nt | tee dbpedia_persondata.txt -java -Xmx80g -jar graphlod-0.1.jar --skipChromatic \ +java -Xmx80g -jar graphlod-0.1.jar --name dbpedia_geo_coordinate --skipChromatic \ --namespace "http://dbpedia.org/resource" \ /data/graphlod/dbpedia/geo_coordinates_en.nt | tee geo_coordinate.txt -java -Xmx80g -jar graphlod-0.1.jar --skipChromatic \ +java -Xmx80g -jar graphlod-0.1.jar --name dbpedia_homepages --skipChromatic \ --namespace "http://dbpedia.org/resource" \ /data/graphlod/dbpedia/homepages_en.nt | tee dbpedia_homepages.txt # fix mapping: sed 's/"\.$/" \./' mappingbased_properties_en.nt > mappingbased_properties_en_fixed.nt -java -Xmx100g -jar 
graphlod-0.1.jar --skipChromatic \ +java -Xmx100g -jar graphlod-0.1.jar --name dbpedia_mapping --skipChromatic \ --namespace "http://dbpedia.org/resource" \ mappingbased_properties_en_fixed.nt | tee dbpedia_mapping.txt -java -Xmx80g -jar graphlod-0.1.jar --skipChromatic \ +java -Xmx80g -jar graphlod-0.1.jar --name linkedgeodata --skipChromatic \ /data/graphlod/linkedgeodata/2013-04-29-{Ae*,C*,E*,Mili*,H*,P*,S*,T*} \ --namespace "http://linkedgeodata.org/" \ | tee linkedgeodata.txt +zip result.zip *.txt *.csv diff --git a/pom.xml b/pom.xml index d6c8962..ee63ba7 100644 --- a/pom.xml +++ b/pom.xml @@ -55,6 +55,11 @@ commons-lang3 3.3.2 + + org.apache.commons + commons-csv + 1.0 + com.google.guava guava diff --git a/src/graphlod/CollectionUtils.java b/src/graphlod/CollectionUtils.java index 40caae5..08ec9a8 100644 --- a/src/graphlod/CollectionUtils.java +++ b/src/graphlod/CollectionUtils.java @@ -31,7 +31,7 @@ public static > T min(Collection collection) { } T result = collection.iterator().next(); for (T element : collection) { - result = result.compareTo(element) > 0 ? result : element; + result = result.compareTo(element) < 0 ? result : element; } return result; } @@ -42,7 +42,7 @@ public static > T max(Collection collection) { } T result = collection.iterator().next(); for (T element : collection) { - result = result.compareTo(element) < 0 ? result : element; + result = result.compareTo(element) > 0 ? result : element; } return result; } diff --git a/src/graphlod/Dataset.java b/src/graphlod/Dataset.java index 729b8dd..1257b9f 100644 --- a/src/graphlod/Dataset.java +++ b/src/graphlod/Dataset.java @@ -72,7 +72,7 @@ private void readTriples(NxParser nxp) { } if (subjectUri.equals(objectUri)) { - continue; // TODO: why that? 
+ continue; } if (propertyUri.equals("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")) { diff --git a/src/graphlod/GraphCsvOutput.java b/src/graphlod/GraphCsvOutput.java new file mode 100644 index 0000000..154b3e9 --- /dev/null +++ b/src/graphlod/GraphCsvOutput.java @@ -0,0 +1,53 @@ +package graphlod; + + +import java.io.IOException; +import java.io.Writer; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.List; +import java.util.Set; + +import org.apache.commons.csv.CSVFormat; +import org.apache.commons.csv.CSVPrinter; + +import com.google.common.base.Charsets; +import com.google.common.base.Verify; + +public class GraphCsvOutput { + + private final CSVPrinter writer; + private final int maxSizeForDiameter; + + public GraphCsvOutput(String name, int maxSizeForDiameter) { + this.maxSizeForDiameter = maxSizeForDiameter; + Writer out; + try { + Path path = Paths.get(name + "_graphs.csv"); + out = Files.newBufferedWriter(path, Charsets.UTF_8); + writer = CSVFormat.DEFAULT.withHeader("graph", "vertices", "edges", "diameter", "avgindegree", "maxindegree", "avgoutdegree", "maxoutdegree").print(out); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + public void writeGraph(GraphFeatures graph) { + double diameter = graph.getVertexCount() < maxSizeForDiameter ? 
graph.getDiameter() : -1; + try { + writer.printRecord(graph.getId(), graph.getVertexCount(), graph.getEdgeCount(), diameter, + CollectionUtils.average(graph.getIndegrees()), CollectionUtils.max(graph.getIndegrees()), + CollectionUtils.average(graph.getOutdegrees()), CollectionUtils.max(graph.getOutdegrees())); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + public void close() { + try { + writer.close(); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + +} diff --git a/src/graphlod/GraphFeatures.java b/src/graphlod/GraphFeatures.java index dca5bb0..dd11ed1 100644 --- a/src/graphlod/GraphFeatures.java +++ b/src/graphlod/GraphFeatures.java @@ -30,8 +30,10 @@ public class GraphFeatures { private Set vertices; private final Set edges; private AsUndirectedGraph undirectedG; + private String id; - public GraphFeatures(DirectedGraph graph) { + public GraphFeatures(String id, DirectedGraph graph) { + this.id = id; this.graph = graph; this.vertices = this.graph.vertexSet(); this.edges = this.graph.edgeSet(); @@ -79,7 +81,7 @@ public List getConnectedSubGraphFeatures(float minSize) { return Collections.emptyList(); } List connectedSubgraphFeatures = new ArrayList<>(); - + int i = 0; for (Set set : sets) { if (set.size() < minSize) { continue; @@ -94,7 +96,8 @@ public List getConnectedSubGraphFeatures(float minSize) { subgraph.addEdge(vertex, (String) edge.getTarget(), edge); } } - connectedSubgraphFeatures.add(new GraphFeatures(subgraph)); + connectedSubgraphFeatures.add(new GraphFeatures("subgraph" + i, subgraph)); + i++; } Collections.sort(connectedSubgraphFeatures, new Comparator() { @Override @@ -132,6 +135,13 @@ public List getIndegrees() { return this.indegrees; } + public List getIndegrees2() { + if(this.indegrees2 == null) { + getIndegrees(); + } + return this.indegrees2; + } + public List getOutdegrees() { if (this.outdegrees == null) { this.outdegrees = new ArrayList<>(); @@ -145,6 +155,14 @@ public List getOutdegrees() { return 
this.outdegrees; } + + public List getOutdegrees2() { + if(this.outdegrees2 == null) { + getOutdegrees(); + } + return this.outdegrees2; + } + public ArrayList getEdgeCounts() { ArrayList edgeCounts = new ArrayList<>(); for (String vertex : this.vertices) { @@ -165,6 +183,14 @@ public int getChromaticNumber() { return ChromaticNumber.findGreedyChromaticNumber(this.undirectedG); } + public Set getVertices() { + return vertices; + } + + public String getId() { + return id; + } + static class Degree implements Comparable { public String vertex; public int degree; diff --git a/src/graphlod/GraphLOD.java b/src/graphlod/GraphLOD.java index fad48ff..9ea6018 100644 --- a/src/graphlod/GraphLOD.java +++ b/src/graphlod/GraphLOD.java @@ -28,11 +28,17 @@ public class GraphLOD { private static final Logger logger = Logger.getLogger(GraphLOD.class); public static final int MAX_SIZE_FOR_DIAMETER = 500; - public GraphLOD(Collection datasetFiles, boolean skipChromaticNumber, String namespace, Collection excludedNamespaces, float minImportantSubgraphSize, int importantDegreeCount) { + public GraphCsvOutput graphCsvOutput; + public VertexCsvOutput vertexCsvOutput; + + public GraphLOD(String name, Collection datasetFiles, boolean skipChromaticNumber, String namespace, Collection excludedNamespaces, float minImportantSubgraphSize, int importantDegreeCount) { + graphCsvOutput = new GraphCsvOutput(name, MAX_SIZE_FOR_DIAMETER); + vertexCsvOutput = new VertexCsvOutput(name); + Stopwatch sw = Stopwatch.createStarted(); Dataset dataset = Dataset.fromFiles(datasetFiles, namespace, excludedNamespaces); - GraphFeatures graphFeatures = new GraphFeatures(dataset.getGraph()); + GraphFeatures graphFeatures = new GraphFeatures("main_graph", dataset.getGraph()); System.out.println("Loading the dataset took " + sw + " to execute."); @@ -85,6 +91,7 @@ public GraphLOD(Collection datasetFiles, boolean skipChromaticNumber, St System.out.printf("Subgraph: %s vertices\n", subGraph.getVertexCount()); 
analyzeConnectedGraph(subGraph, importantDegreeCount); } + System.out.println("Analysing the subgraphs took " + sw + " to execute."); } @@ -109,6 +116,8 @@ public GraphLOD(Collection datasetFiles, boolean skipChromaticNumber, St System.out.println("Chromatic Number: " + cN); System.out.println("Getting the Chromatic Number took " + sw + " to execute."); } + graphCsvOutput.close(); + vertexCsvOutput.close(); } private void analyzeConnectedGraph(GraphFeatures graph, int importantDegreeCount) { @@ -118,6 +127,8 @@ private void analyzeConnectedGraph(GraphFeatures graph, int importantDegreeCount } else { System.out.println("\tGraph too big to show diameter"); } + graphCsvOutput.writeGraph(graph); + vertexCsvOutput.writeGraph(graph); System.out.println("\thighest indegrees:"); System.out.println("\t\t" + StringUtils.join(graph.maxInDegrees(importantDegreeCount), "\n\t\t")); @@ -133,6 +144,7 @@ public static void main(final String[] args) { ArgumentParser parser = ArgumentParsers.newArgumentParser("GraphLOD") .defaultHelp(true).description("calculates graph features."); parser.addArgument("dataset").nargs("+").setDefault(Arrays.asList(DEFAULT_DATASET_LOCATION)); + parser.addArgument("--name").type(String.class).setDefault(""); parser.addArgument("--namespace").type(String.class).setDefault(""); parser.addArgument("--excludedNamespaces").nargs("*").setDefault(Collections.emptyList()); parser.addArgument("--skipChromatic").action(Arguments.storeTrue()); @@ -145,7 +157,12 @@ public static void main(final String[] args) { parser.handleError(e); System.exit(1); } + List dataset = result.getList("dataset"); + String name = result.getString("name"); + if(name.isEmpty()) { + name = dataset.get(0); + } String namespace = result.getString("namespace"); List excludedNamespaces = result.getList("excludedNamespaces"); boolean skipChromatic = result.getBoolean("skipChromatic"); @@ -153,6 +170,7 @@ public static void main(final String[] args) { int importantDegreeCount = 
result.getInt("importantDegreeCount"); System.out.println("reading: " + dataset); + System.out.println("name: " + name); System.out.println("namespace: " + namespace); System.out.println("skip chromatic: " + skipChromatic); System.out.println("excluded namespaces: " + excludedNamespaces); @@ -162,7 +180,7 @@ public static void main(final String[] args) { Locale.setDefault(Locale.US); - new GraphLOD(dataset, skipChromatic, namespace, excludedNamespaces, minImportantSubgraphSize, importantDegreeCount); + new GraphLOD(name, dataset, skipChromatic, namespace, excludedNamespaces, minImportantSubgraphSize, importantDegreeCount); } } diff --git a/src/graphlod/VertexCsvOutput.java b/src/graphlod/VertexCsvOutput.java new file mode 100644 index 0000000..4c82dd8 --- /dev/null +++ b/src/graphlod/VertexCsvOutput.java @@ -0,0 +1,55 @@ +package graphlod; + + +import java.io.IOException; +import java.io.Writer; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.List; + +import org.apache.commons.csv.CSVFormat; +import org.apache.commons.csv.CSVPrinter; + +import com.google.common.base.Charsets; +import com.google.common.base.Verify; + +public class VertexCsvOutput { + + CSVPrinter writer; + + public VertexCsvOutput(String name) { + Writer out; + try { + Path path = Paths.get(name + "_vertices.csv"); + out = Files.newBufferedWriter(path, Charsets.UTF_8); + writer = CSVFormat.DEFAULT.withHeader("graph", "vertex", "indegree", "outdegree").print(out); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + public void writeGraph(GraphFeatures graph) { + List inDegrees = graph.getIndegrees2(); + List outDegrees = graph.getOutdegrees2(); + + for (int i = 0; i < inDegrees.size(); i++) { + GraphFeatures.Degree in = inDegrees.get(i); + GraphFeatures.Degree out = outDegrees.get(i); + Verify.verify(in.vertex.equals(out.vertex)); + try { + writer.printRecord(graph.getId(), in.vertex, in.degree, out.degree); + } catch 
(IOException e) { + throw new RuntimeException(e); + } + } + } + public void close() { + try { + writer.close(); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + +} diff --git a/test/graphlod/CollectionUtilsTest.java b/test/graphlod/CollectionUtilsTest.java index 7daa737..1a680f7 100644 --- a/test/graphlod/CollectionUtilsTest.java +++ b/test/graphlod/CollectionUtilsTest.java @@ -1,11 +1,13 @@ package graphlod; +import java.util.Arrays; import java.util.Collection; import org.junit.Test; import com.google.common.collect.Lists; +import static org.hamcrest.Matchers.is; import static org.junit.Assert.*; import static org.hamcrest.Matchers.contains; import static org.hamcrest.Matchers.containsInAnyOrder; @@ -16,4 +18,17 @@ public class CollectionUtilsTest { public void testMaxValues() throws Exception { assertThat(CollectionUtils.maxValues(Lists.newArrayList(5,2,9,2), 2), contains(5,9)); } + + @Test + public void testMax() { + assertThat(CollectionUtils.max(Arrays.asList(1, 5, 3)), is(5)); + } + @Test + public void testMin() { + assertThat(CollectionUtils.min(Arrays.asList(1, 5, 3)), is(1)); + } + @Test + public void testAvg() { + assertThat(CollectionUtils.average(Arrays.asList(1, 5, 3)), is(3.0)); + } } \ No newline at end of file diff --git a/test/graphlod/GraphFeaturesTest.java b/test/graphlod/GraphFeaturesTest.java index 3a47370..eb9f933 100644 --- a/test/graphlod/GraphFeaturesTest.java +++ b/test/graphlod/GraphFeaturesTest.java @@ -29,7 +29,7 @@ public void setup() { createStatement("b", "p1", "d"), createStatement("d", "p1", "b"), createStatement("c", "p1", "e")), "", new ArrayList()); - features = new GraphFeatures(ds.getGraph()); + features = new GraphFeatures("" , ds.getGraph()); } diff --git a/test/graphlod/GraphFeaturesTest2.java b/test/graphlod/GraphFeaturesTest2.java index 6fce48b..1cd6afc 100644 --- a/test/graphlod/GraphFeaturesTest2.java +++ b/test/graphlod/GraphFeaturesTest2.java @@ -35,7 +35,7 @@ public void setup() { 
createStatement("c", "p1", "d"), createStatement("d", "p1", "e"), createStatement("e", "p1", "c")), "", new ArrayList()); - features = new GraphFeatures(ds.getGraph()); + features = new GraphFeatures("", ds.getGraph()); } @Test