Skip to content

Commit

Permalink
#1 initial data typer
Browse files Browse the repository at this point in the history
  • Loading branch information
michael-conway committed Dec 28, 2017
1 parent 5289b69 commit 3595a89
Show file tree
Hide file tree
Showing 5 changed files with 184 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,14 @@
*/
package org.irodsext.dataprofiler;

import static org.junit.Assert.fail;

import java.util.Properties;

import org.irods.jargon.core.connection.IRODSAccount;
import org.irods.jargon.core.connection.JargonProperties;
import org.irods.jargon.core.connection.SettableJargonProperties;
import org.irods.jargon.core.pub.IRODSAccessObjectFactory;
import org.irods.jargon.core.pub.IRODSFileSystem;
import org.irods.jargon.extensions.dataprofiler.DataProfilerSettings;
import org.irods.jargon.testutils.IRODSTestSetupUtilities;
import org.irods.jargon.testutils.TestingPropertiesHelper;
import org.irods.jargon.testutils.filemanip.ScratchFileUtils;
Expand Down Expand Up @@ -69,8 +70,21 @@ public void afterEach() throws Exception {
* {@link org.irodsext.dataprofiler.IrodsextDataProfilerService#retrieveDataProfile(java.lang.String, org.irods.jargon.extensions.dataprofiler.DataProfilerSettings)}.
*/
@Test
public void testRetrieveDataProfileStringDataProfilerSettings() {
fail("Not yet implemented");
public void testBasicDataProfileWithCollection() throws Exception {
    // Fixture: account, access object factory and target collection path, all
    // derived from the testing properties.
    IRODSAccount account = testingPropertiesHelper.buildIRODSAccountFromTestProperties(testingProperties);
    IRODSAccessObjectFactory factory = irodsFileSystem.getIRODSAccessObjectFactory();
    String collectionPath = testingPropertiesHelper.buildIRODSCollectionAbsolutePathFromTestProperties(
            testingProperties, IRODS_TEST_SUBDIR_PATH);

    // Profile settings for this scenario: only ACLs and metadata are switched
    // on, every other retrieval/detection option is switched off.
    DataProfilerSettings settings = new DataProfilerSettings();
    settings.setDetectMimeAndInfoType(false);
    settings.setRetrieveAcls(true);
    settings.setRetrieveMetadata(true);
    settings.setRetrieveReplicas(false);
    settings.setRetrieveShared(false);
    settings.setRetrieveStarred(false);
    settings.setRetrieveTickets(false);

    // NOTE(review): nothing is invoked or asserted with the fixture yet; the
    // test currently passes as long as the setup above does not throw.
}

}
4 changes: 4 additions & 0 deletions irodsext-data-typer/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@
<artifactId>jargon-core</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
</dependency>
</dependencies>
<description>Tools for managing data type/format recognition</description>
<build>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
/**
*
*/
package org.irodsext.datatyper;

import java.io.IOException;

import org.apache.tika.detect.Detector;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.irods.jargon.core.connection.IRODSAccount;
import org.irods.jargon.core.exception.DataNotFoundException;
import org.irods.jargon.core.exception.JargonException;
import org.irods.jargon.core.pub.IRODSAccessObjectFactory;
import org.irods.jargon.core.utils.LocalFileUtils;
import org.irods.jargon.core.utils.MiscIRODSUtils;
import org.irods.jargon.extensions.datatyper.DataType;
import org.irods.jargon.extensions.datatyper.DataTypeResolutionService;
import org.irods.jargon.extensions.datatyper.DataTyperSettings;
import org.irods.jargon.extensions.datatyper.IrodsMimeTypes;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Data type resolution service to determine MIME and info types of a file. Note
 * that this is at first a very basic service that will need to evolve over
 * time.
 * <p>
 * Resolution is currently based on the file name only: known iRODS-specific
 * extensions are checked first, and Tika name-based detection is used as the
 * fallback. File content is not read.
 *
 * @author Mike Conway - NIEHS
 *
 */
public class IrodsextDataTypeResolutionService extends DataTypeResolutionService {

    public static final Logger log = LoggerFactory.getLogger(IrodsextDataTypeResolutionService.class);

    /**
     * Create the service, passing all collaborators through to the superclass.
     *
     * @param irodsAccessObjectFactory
     *            {@link IRODSAccessObjectFactory} handed to the superclass
     * @param irodsAccount
     *            {@link IRODSAccount} handed to the superclass
     * @param dataTyperSettings
     *            {@link DataTyperSettings} handed to the superclass as the
     *            default settings
     */
    public IrodsextDataTypeResolutionService(IRODSAccessObjectFactory irodsAccessObjectFactory,
            IRODSAccount irodsAccount, DataTyperSettings dataTyperSettings) {
        super(irodsAccessObjectFactory, irodsAccount, dataTyperSettings);
    }

    /**
     * Resolve the data type of the given path using the default settings of this
     * service.
     *
     * @param irodsAbsolutePath
     *            {@code String} with the iRODS absolute path, must not be null or
     *            empty
     * @return {@link DataType} whose MIME type is set; an empty string is used
     *         when no MIME type could be determined
     * @throws IllegalArgumentException
     *             if the path is null or empty
     * @throws DataNotFoundException
     *             declared by the overridden method
     * @throws JargonException
     *             on an I/O error during detection
     */
    @Override
    public DataType resolveDataType(String irodsAbsolutePath) throws DataNotFoundException, JargonException {
        return resolveDataType(irodsAbsolutePath, this.getDefaultDataTyperSettings());
    }

    /**
     * Resolve the data type of the given path using the supplied settings.
     *
     * @param irodsAbsolutePath
     *            {@code String} with the iRODS absolute path, must not be null or
     *            empty
     * @param dataTyperSettings
     *            {@link DataTyperSettings} controlling the resolution, must not
     *            be null
     * @return {@link DataType} whose MIME type is set; an empty string is used
     *         when no MIME type could be determined
     * @throws IllegalArgumentException
     *             if the path is null or empty, or the settings are null
     * @throws DataNotFoundException
     *             declared by the overridden method
     * @throws JargonException
     *             on an I/O error during detection
     */
    @Override
    public DataType resolveDataType(String irodsAbsolutePath, DataTyperSettings dataTyperSettings)
            throws DataNotFoundException, JargonException {
        log.info("resolveDataType()");

        if (irodsAbsolutePath == null || irodsAbsolutePath.isEmpty()) {
            throw new IllegalArgumentException("null or empty irodsAbsolutePath");
        }

        if (dataTyperSettings == null) {
            throw new IllegalArgumentException("null dataTyperSettings");
        }

        log.info("irodsAbsolutePath:{}", irodsAbsolutePath);

        if (dataTyperSettings.isDetailedDetermination()) {
            log.warn("detailedDetermination not yet implemented, will default to check of file path");
        }

        log.info("checking for known irods types - interim code...");

        String mimeType = determineMimeTypeOfIrodsObjects(irodsAbsolutePath);

        // Tika is only consulted when the iRODS-specific check found nothing
        if (mimeType == null) {
            log.info("not a known irods type, use Tika to derive based on file extension");
            mimeType = determineMimeTypeViaTika(irodsAbsolutePath);
        }

        // callers always get a non-null MIME type string
        if (mimeType == null) {
            log.info("no mime type found via tika");
            mimeType = "";
        }

        DataType dataType = new DataType();
        dataType.setMimeType(mimeType);
        log.info("dataType:{}", dataType);
        return dataType;
    }

    /**
     * Ask Tika for a media type based solely on the last path component (the
     * file name); no stream is supplied, so content is not inspected.
     *
     * @param irodsAbsolutePath
     *            {@code String} with the iRODS absolute path
     * @return {@code String} form of the detected media type, or {@code null} if
     *         the detector returned nothing
     * @throws JargonException
     *             wrapping any {@link IOException} raised by the detector
     */
    private String determineMimeTypeViaTika(String irodsAbsolutePath) throws JargonException {
        Detector detector = new AutoDetectParser().getDetector();
        String fileName = MiscIRODSUtils.getLastPathComponentForGivenAbsolutePath(irodsAbsolutePath);

        Metadata md = new Metadata();
        md.add(Metadata.RESOURCE_NAME_KEY, fileName);

        MediaType mediaType;
        try {
            // null input stream: detection relies on the resource name hint only
            mediaType = detector.detect(null, md);
        } catch (IOException e) {
            throw new JargonException("io exception determining file type by extension", e);
        }

        return mediaType == null ? null : mediaType.toString();
    }

    /**
     * Front-load detection of special iRODS file types, based on the file
     * extension of the path.
     *
     * @param irodsAbsolutePath
     *            {@code String} with the iRODS absolute path
     * @return {@code String} with the iRODS-specific MIME type, or {@code null}
     *         if the extension is missing or not a recognized iRODS type
     */
    private String determineMimeTypeOfIrodsObjects(final String irodsAbsolutePath) {

        String extension = LocalFileUtils.getFileExtension(irodsAbsolutePath);
        if (extension == null || extension.isEmpty()) {
            return null;
        }

        if (extension.equals(".r")) {
            log.info("irods rule detected in:{}", irodsAbsolutePath);
            return IrodsMimeTypes.APPLICATION_IRODS_RULE;
        }

        return null;
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@

/**
* Default data typer service to determine MIME and info types of a file
*
* @author Mike Conway - NIEHS
*
*/
package org.irodsext.datatyper;
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
package org.irodsext.datatyper;

import org.irods.jargon.core.connection.IRODSAccount;
import org.irods.jargon.core.pub.IRODSAccessObjectFactory;
import org.irods.jargon.extensions.datatyper.DataType;
import org.irods.jargon.extensions.datatyper.DataTypeResolutionService;
import org.irods.jargon.extensions.datatyper.DataTyperSettings;
import org.junit.Assert;
import org.junit.Test;
import org.mockito.Mockito;

/**
 * Unit test for {@link IrodsextDataTypeResolutionService}, run against mocked
 * iRODS collaborators.
 */
public class IrodsextDataTypeResolutionServiceTest {

    /**
     * A path ending in <code>.txt</code> resolves to the <code>text/plain</code>
     * MIME type when detailed determination is switched off.
     */
    @Test
    public void testResolveDataTypeString() throws Exception {
        final String path = "/a/path/file.txt";

        final DataTyperSettings settings = new DataTyperSettings();
        settings.setDetailedDetermination(false);
        settings.setPersistDataTypes(false);

        final IRODSAccount account = Mockito.mock(IRODSAccount.class);
        final IRODSAccessObjectFactory factory = Mockito.mock(IRODSAccessObjectFactory.class);

        final DataTypeResolutionService service = new IrodsextDataTypeResolutionService(factory, account,
                settings);
        final DataType resolved = service.resolveDataType(path);

        Assert.assertNotNull("no type returned", resolved);
        Assert.assertEquals("text/plain", resolved.getMimeType());
    }

}

0 comments on commit 3595a89

Please sign in to comment.