Java Application for Batch Processing FEDORA Objects
We had a need today to transform an XML file with a custom DTD into Dublin Core; the custom XML file is a datastream in our FEDORA repository and we want to put the Dublin Core XML file back into the FEDORA object as the DC datastream. This took a slew of technologies and techniques: reading a datastream out of the FEDORA repository using API-A, parsing XML documents using the Java DOM library, creating a new document with the correct namespaces using Java DOM, and modifying the DC datastream in the repository using API-M.
I’m posting the code here in case someone else might find it useful. Of course, if you know a better way, please let me know. We’ll probably need to do things like this again…
/**********************************************************************************
*
* Copyright (C) 2006 OhioLINK
*
* This file is part of the OhioLINK Digital Resource Commons (DRC) Project.
*
* The OhioLINK DRC is free software; you can redistribute it and/or
* modify it under the terms of the Affero General Public License as
* published by Affero, Inc. -- either version 1 of the License, or
* (at your option) any later version.
*
* The OhioLINK DRC Project is distributed in the hope that it will be
* useful, but WITHOUT ANY WARRANTY -- without even the implied warranty
* of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* Affero General Public License for more details.
*
* You should have received a copy of the Affero General Public
* License in the LICENSE.txt file that comes with the DRC project;
* if not, write to DRC Development Team, OhioLINK, 2455 North Star Rd,
* Suite 300, Columbus, OH 43221, USA.
*********************************************************************************/
package batch;
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.io.StringWriter;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;
import fedora.client.FedoraClient;
import fedora.server.access.FedoraAPIA;
import fedora.server.management.FedoraAPIM;
import fedora.server.types.gen.DatastreamDef;
import fedora.server.types.gen.MIMETypedStream;
/**
 * Batch job that rebuilds the Dublin Core ("DC") datastream of FEDORA objects
 * from the ETD XML datastream stored in the same object.
 *
 * For every PID in the configured range the job lists the object's
 * datastreams through API-A, parses each datastream whose identifier ends in
 * "etd", copies selected fields into a new oai_dc:dc document and writes that
 * document back as the "DC" datastream through API-M.
 *
 * A failure while processing one object is reported on standard output and
 * does not stop the remaining objects from being processed.
 */
public class Batch {

    // NOTE(review): server URL and credentials are hard-coded; consider moving
    // them to configuration before this is used outside a one-off run.
    private static final String FEDORA_URL = "http://fedora.server/fedora";
    private static final String FEDORA_USER = "fedoraAdmin";
    private static final String FEDORA_PASSWORD = "password";

    /** "hdl" is our FEDORA PID prefix. */
    private static final String PID_PREFIX = "hdl:";
    /** First numeric PID suffix to process (inclusive). */
    private static final int FIRST_ID = 80;
    /** End of the numeric PID suffix range (exclusive). */
    private static final int END_ID = 81;

    private static final String OAI_DC_NS = "http://www.openarchives.org/OAI/2.0/oai_dc/";
    private static final String DC_NS = "http://purl.org/dc/elements/1.1/";
    private static final String XMLNS_NS = "http://www.w3.org/2000/xmlns/";

    /** Source elements that are joined, in this order, to form dc:creator. */
    private static final String[] AUTHOR_NAME_FIELDS = {
        "authfname", "authmname", "authlname", "authsuffix"
    };

    /**
     * Runs the batch over the configured PID range.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        FedoraAPIA apia;
        FedoraAPIM apim;
        DocumentBuilder builder;
        try {
            // The client, the API stubs and the parser do not depend on the
            // PID, so they are created once instead of once per object.
            FedoraClient client = new FedoraClient(FEDORA_URL, FEDORA_USER, FEDORA_PASSWORD);
            apia = client.getAPIA();
            apim = client.getAPIM();
            DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
            factory.setNamespaceAware(true);
            // NOTE(review): the ETD files are described as using a custom DTD,
            // so DOCTYPE processing is left enabled here. If these datastreams
            // can ever come from an untrusted source, harden the factory
            // against external entity resolution.
            builder = factory.newDocumentBuilder();
        } catch (Exception e) {
            System.out.println("Setup failed: " + e);
            return;
        }

        for (int i = FIRST_ID; i < END_ID; i++) {
            String pid = PID_PREFIX + i;
            try {
                processObject(apia, apim, builder, pid);
            } catch (Exception e) {
                // Print the exception itself (class name plus message): the
                // message alone is null for many exceptions.
                System.out.println(pid + " " + e);
            }
        }
    }

    /**
     * Converts every "etd" datastream of one object into Dublin Core and
     * stores the result as the object's DC datastream.
     *
     * @param apia    access API used to list and read datastreams
     * @param apim    management API used to write the DC datastream
     * @param builder namespace-aware parser used for both documents
     * @param pid     PID of the object to process
     * @throws Exception if the repository calls, the parsing, the conversion
     *                   or the serialization fail
     */
    private static void processObject(FedoraAPIA apia, FedoraAPIM apim,
            DocumentBuilder builder, String pid) throws Exception {
        DatastreamDef[] datastreams = apia.listDatastreams(pid, null);
        for (int j = 0; j < datastreams.length; j++) {
            String itemId = datastreams[j].getID();
            if (!itemId.endsWith("etd")) {
                continue;
            }
            // Found an ETD datastream: fetch it and parse it into a DOM document.
            MIMETypedStream ds = apia.getDatastreamDissemination(pid, itemId, null);
            InputStream inputStream = new ByteArrayInputStream(ds.getStream());
            Document sourceDoc = builder.parse(inputStream);

            byte[] dcBytes = serialize(buildDublinCore(builder, sourceDoc, pid));

            // Lastly, write the modified DC datastream back to the FEDORA server.
            apim.modifyDatastreamByValue(pid, "DC", null, "Dublin Core", false, "text/xml", null, dcBytes, "A", "Batch program to add DC datastream from ETD XML file", false);
        }
    }

    /**
     * Builds the oai_dc:dc document for one ETD document.
     *
     * @param builder   parser used to create the empty destination document
     * @param sourceDoc parsed ETD document
     * @param pid       PID written as dc:identifier
     * @return the new Dublin Core document
     * @throws IllegalStateException if a required ETD element is missing
     */
    private static Document buildDublinCore(DocumentBuilder builder, Document sourceDoc, String pid) {
        Document dcDoc = builder.newDocument();
        Element root = dcDoc.createElementNS(OAI_DC_NS, "oai_dc:dc");
        root.setAttributeNS(XMLNS_NS, "xmlns:oai_dc", OAI_DC_NS);
        root.setAttributeNS(XMLNS_NS, "xmlns:dc", DC_NS);
        dcDoc.appendChild(root);

        appendDcElement(root, "identifier", pid);
        appendDcElement(root, "title", requiredText(sourceDoc, "title"));
        appendDcElement(root, "creator", authorName(sourceDoc));
        appendDcElement(root, "language", requiredText(sourceDoc, "language"));
        appendDcElement(root, "description", requiredText(sourceDoc, "abstract"));
        appendDcElement(root, "date", requiredText(sourceDoc, "docyear"));
        appendDcElement(root, "subject", requiredText(sourceDoc, "subjects"));
        return dcDoc;
    }

    /**
     * Appends a dc:localName child holding the given text to parent.
     *
     * The element is created with createElementNS so that it really belongs to
     * the Dublin Core namespace; an element created with
     * createElement("dc:...") has no namespace URI.
     */
    private static void appendDcElement(Element parent, String localName, String value) {
        Document owner = parent.getOwnerDocument();
        Element child = owner.createElementNS(DC_NS, "dc:" + localName);
        child.appendChild(owner.createTextNode(value));
        parent.appendChild(child);
    }

    /**
     * Joins the author's name parts with single spaces. Parts that are missing
     * from the source document or empty are skipped.
     */
    private static String authorName(Document sourceDoc) {
        StringBuilder author = new StringBuilder();
        for (String field : AUTHOR_NAME_FIELDS) {
            String part = firstText(sourceDoc, field);
            if (part != null && part.length() > 0) {
                if (author.length() > 0) {
                    author.append(' ');
                }
                author.append(part);
            }
        }
        return author.toString();
    }

    /**
     * Returns the normalized text of the first element named tag.
     *
     * @throws IllegalStateException if the document has no such element, so
     *                               that the log names the missing field
     */
    private static String requiredText(Document sourceDoc, String tag) {
        String text = firstText(sourceDoc, tag);
        if (text == null) {
            throw new IllegalStateException("ETD document has no <" + tag + "> element");
        }
        return text;
    }

    /**
     * Returns the normalized text of the first element named tag, or null when
     * the document contains no such element.
     */
    private static String firstText(Document sourceDoc, String tag) {
        NodeList matches = sourceDoc.getElementsByTagName(tag);
        if (matches.getLength() == 0) {
            return null;
        }
        return normalizeWhitespace(matches.item(0).getTextContent());
    }

    /**
     * Replaces each line break (with the tabs and spaces around it) and each
     * run of two or more tabs/spaces by one space, then trims the result.
     */
    private static String normalizeWhitespace(String text) {
        return text.replaceAll("[\t ]*\n[\t ]*", " ").replaceAll("[\t ][\t ]+", " ").trim();
    }

    /**
     * Serializes the document, without an XML declaration, to UTF-8 bytes.
     */
    private static byte[] serialize(Document doc)
            throws TransformerException, UnsupportedEncodingException {
        Transformer transformer = TransformerFactory.newInstance().newTransformer();
        transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
        StringWriter out = new StringWriter();
        transformer.transform(new DOMSource(doc), new StreamResult(out));
        return out.toString().getBytes("UTF-8");
    }
}