Java Application for Batch Processing FEDORA Objects

We had a need today to transform an XML file with a custom DTD into Dublin Core; the custom XML file is a datastream in our FEDORA repository and we want to put the Dublin Core XML file back into the FEDORA object as the DC datastream. This took a slew of technologies and techniques: reading a datastream out of the FEDORA repository using API-A, parsing XML documents using the Java DOM library, creating a new document with the correct namespaces using Java DOM, and modifying the DC datastream in the repository using API-M.

I’m posting the code here in case someone else finds it useful. Of course, if you know a better way, please let me know. We’ll probably need to do things like this again…

/**********************************************************************************
 *
 * Copyright (C) 2006 OhioLINK
 *
 * This file is part of the OhioLINK Digital Resource Commons (DRC) Project.
 *
 * The OhioLINK DRC is free software; you can redistribute it and/or
 * modify it under the terms of the Affero General Public License as
 * published by Affero, Inc. -- either version 1 of the License, or
 * (at your option) any later version.
 *
 * The OhioLINK DRC Project is distributed in the hope that it will be
 * useful, but WITHOUT ANY WARRANTY -- without even the implied warranty
 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * Affero General Public License for more details.
 *
 * You should have received a copy of the Affero General Public
 * License in the LICENSE.txt file that comes with the DRC project;
 * if not, write to DRC Development Team, OhioLINK, 2455 North Star Rd,
 * Suite 300, Columbus, OH 43221, USA.
 *********************************************************************************/
 
package batch;
 
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.io.StringWriter;
import java.net.MalformedURLException;
 
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
 
import org.w3c.dom.Document;
import org.w3c.dom.Element;
 
import fedora.client.FedoraClient;
import fedora.server.access.FedoraAPIA;
import fedora.server.management.FedoraAPIM;
import fedora.server.types.gen.DatastreamDef;
import fedora.server.types.gen.MIMETypedStream;
 
public class Batch {
 
 public static void main(String[] args) {
  for (int i = 80; i < 81; i++) {
   // "hdl" is our FEDORA PID prefix
   String pid = "hdl:" + i;
 
   try {
    FedoraClient client = new FedoraClient(
      "http://fedora.server/fedora",
      "fedoraAdmin", "password");
    FedoraAPIA apia = client.getAPIA();
    FedoraAPIM apim = client.getAPIM();
 
    //
    // Get the list of datastreams for this object.  For each one, we're
    // going to look for an identifier that ends in "etd"
    DatastreamDef[] datastreams = apia.listDatastreams(pid, null);
    for (int j = 0; j < datastreams.length; j++) {
     DatastreamDef def = datastreams[j];
     String itemId = def.getID();
     if (itemId.endsWith("etd")) {
 
      //
      // If we've found it, get it out of the FEDORA server and
      // create a XML DOM document for it
      MIMETypedStream ds = apia.getDatastreamDissemination(pid,itemId,null);
      byte[] file = ds.getStream();
      InputStream inputStream = new ByteArrayInputStream(file);
      // String fileStr = new String(file, "ascii");
      // System.out.println(fileStr);
 
      DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
      factory.setNamespaceAware(true);
      DocumentBuilder builder = factory.newDocumentBuilder();
      Document sourceDoc = builder.parse(inputStream);
 
      //
      // Now build an empty XML DOM document for the Dublin Core
      Document destDoc = builder.newDocument();
      Element rootElement=destDoc.createElementNS("http://www.openarchives.org/OAI/2.0/oai_dc/","oai_dc:dc");
      rootElement.setAttributeNS("http://www.w3.org/2000/xmlns/","xmlns:oai_dc","http://www.openarchives.org/OAI/2.0/oai_dc/");
      rootElement.setAttributeNS("http://www.w3.org/2000/xmlns/","xmlns:dc","http://purl.org/dc/elements/1.1/");
      destDoc.appendChild(rootElement);
 
      //
      // Now copy the values from the ETD XML document into
      // the DC XML document
      Element e; String value;
      e=destDoc.createElement("dc:identifier");
      e.appendChild(destDoc.createTextNode(pid));
      rootElement.appendChild(e);
 
      e=destDoc.createElement("dc:title");
      value=sourceDoc.getElementsByTagName("title").item(0).getTextContent().replaceAll("[\t ]*\n[\t ]*", " ").replaceAll("[\t ][\t ]+", " ").trim();
      e.appendChild(destDoc.createTextNode(value));
      rootElement.appendChild(e);
 
      // author's name comes in many parts; this'll put them together
      e = destDoc.createElement("dc:creator");
      String nameFields[] = { "authfname", "authmname", "authlname", "authsuffix"};
      String author = new String();
      for (String field : nameFields) {
       value = sourceDoc.getElementsByTagName(field).item(0).getTextContent().replaceAll("[\t ]*\n[\t ]*", " ").replaceAll("[\t ][\t ]+", " ").trim();
       if (value != null && !value.equals("")) {
        author = author.concat(value).concat(" ");
       }
      }
      e.appendChild(destDoc.createTextNode(author.trim()));
      rootElement.appendChild(e);       
 
      e=destDoc.createElement("dc:language");
      value=sourceDoc.getElementsByTagName("language").item(0).getTextContent().replaceAll("[\t ]*\n[\t ]*", " ").replaceAll("[\t ][\t ]+", " ").trim();
      e.appendChild(destDoc.createTextNode(value));
      rootElement.appendChild(e);
 
      e=destDoc.createElement("dc:description");
      value=sourceDoc.getElementsByTagName("abstract").item(0).getTextContent().replaceAll("[\t ]*\n[\t ]*", " ").replaceAll("[\t ][\t ]+", " ").trim();
      e.appendChild(destDoc.createTextNode(value));
      rootElement.appendChild(e);
 
      e=destDoc.createElement("dc:date");
      value=sourceDoc.getElementsByTagName("docyear").item(0).getTextContent().replaceAll("[\t ]*\n[\t ]*", " ").replaceAll("[\t ][\t ]+", " ").trim();
      e.appendChild(destDoc.createTextNode(value));
      rootElement.appendChild(e);       
 
      e=destDoc.createElement("dc:subject");
      value = sourceDoc.getElementsByTagName("subjects").item(0).getTextContent().replaceAll("[\t ]*\n[\t ]*", " ").replaceAll("[\t ][\t ]+", " ").trim();
      e.appendChild(destDoc.createTextNode(value));
      rootElement.appendChild(e);
 
      //
      // Use a Transformer for output
      TransformerFactory tFactory = TransformerFactory.newInstance();
      Transformer transformer = tFactory.newTransformer();
      transformer.setOutputProperty(javax.xml.transform.OutputKeys.OMIT_XML_DECLARATION, "yes");
      DOMSource source = new DOMSource(destDoc);
      StringWriter strWriter = new StringWriter();
      StreamResult result = new StreamResult(strWriter);
      transformer.transform(source, result);
      String xmlAsString=strWriter.getBuffer().toString();
      // System.out.println(xmlAsString);
      byte[] normalarr=xmlAsString.getBytes("UTF-8");
 
      //
      // Lastly, write the modified DC datastream back to the FEDORA server
      apim.modifyDatastreamByValue(pid, "DC", null, "Dublin Core", false, "text/xml", null, normalarr, "A", "Batch program to add DC datastream from ETD XML file", false);
     }
    }
   } catch (MalformedURLException e) {
    System.out.println(pid+" "+e.getLocalizedMessage());
   } catch (Exception e) {
    System.out.println(pid+" "+e.getLocalizedMessage());
   }
  }
 }
}