Java Application for Batch Processing FEDORA Objects

We had a need today to transform an XML file with a custom DTD into Dublin Core; the custom XML file is a datastream in our FEDORA repository and we want to put the Dublin Core XML file back into the FEDORA object as the DC datastream. This took a slew of technologies and techniques: reading a datastream out of the FEDORA repository using API-A, parsing XML documents using the Java DOM library, creating a new document with the correct namespaces using Java DOM, and modifying the DC datastream in the repository using API-M.

I'm posting the code here in case someone else might find it useful. Of course, if you know a better way, please let me know. We'll probably need to do things like this again...

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
/**********************************************************************************
 *
 * Copyright (C) 2006 OhioLINK
 *
 * This file is part of the OhioLINK Digital Resource Commons (DRC) Project.
 *
 * The OhioLINK DRC is free software; you can redistribute it and/or
 * modify it under the terms of the Affero General Public License as
 * published by Affero, Inc. -- either version 1 of the License, or
 * (at your option) any later version.
 *
 * The OhioLINK DRC Project is distributed in the hope that it will be
 * useful, but WITHOUT ANY WARRANTY -- without even the implied warranty
 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * Affero General Public License for more details.
 *
 * You should have received a copy of the Affero General Public
 * License in the LICENSE.txt file that comes with the DRC project;
 * if not, write to DRC Development Team, OhioLINK, 2455 North Star Rd,
 * Suite 300, Columbus, OH 43221, USA.
 *********************************************************************************/

package batch;

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.io.StringWriter;
import java.net.MalformedURLException;
import java.util.regex.Pattern;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;

import fedora.client.FedoraClient;
import fedora.server.access.FedoraAPIA;
import fedora.server.management.FedoraAPIM;
import fedora.server.types.gen.DatastreamDef;
import fedora.server.types.gen.MIMETypedStream;

/**
 * Batch job that derives a Dublin Core (DC) document from the ETD XML
 * datastream of FEDORA objects and writes it back to each object as its
 * "DC" datastream.
 *
 * For every PID in the configured range the job lists the object's
 * datastreams through API-A, picks those whose identifier ends in "etd",
 * parses each one as XML, copies selected values into a new oai_dc:dc
 * document, and stores that document through API-M.
 *
 * A failure while processing one object is reported on standard error and
 * does not stop the remaining objects from being processed.
 */
public class Batch {

 /** Namespace URI of the oai_dc:dc container element. */
 private static final String OAI_DC_NS = "http://www.openarchives.org/OAI/2.0/oai_dc/";

 /** Namespace URI of the dc:* elements. */
 private static final String DC_NS = "http://purl.org/dc/elements/1.1/";

 /** Namespace URI reserved for xmlns:* declaration attributes. */
 private static final String XMLNS_NS = "http://www.w3.org/2000/xmlns/";

 /** First numeric PID suffix to process (inclusive). */
 private static final int FIRST_ID = 80;

 /** Numeric PID suffix at which processing stops (exclusive). */
 private static final int END_ID = 81;

 /** A line break together with the spaces/tabs surrounding it. */
 private static final Pattern LINE_BREAK = Pattern.compile("[\t ]*\n[\t ]*");

 /** A run of two or more spaces/tabs. */
 private static final Pattern BLANK_RUN = Pattern.compile("[\t ][\t ]+");

 /** Source tags holding the parts of the author's name, in output order. */
 private static final String[] AUTHOR_NAME_TAGS = { "authfname", "authmname", "authlname", "authsuffix" };

 /**
  * Entry point: connects to the FEDORA server once, then processes each
  * object in the PID range.
  *
  * @param args command-line arguments (unused)
  */
 public static void main(String[] args) {
  FedoraAPIA apia;
  FedoraAPIM apim;
  try {
   // The client and its API stubs do not depend on the PID, so build
   // them once rather than once per object.
   FedoraClient client = new FedoraClient(
     "http://fedora.server/fedora",
     "fedoraAdmin", "password");
   apia = client.getAPIA();
   apim = client.getAPIM();
  } catch (Exception e) {
   System.err.println("Unable to set up the FEDORA client: " + e);
   return;
  }

  for (int i = FIRST_ID; i < END_ID; i++) {
   // "hdl" is our FEDORA PID prefix
   String pid = "hdl:" + i;
   try {
    processObject(apia, apim, pid);
   } catch (Exception e) {
    // Report and move on to the next object. e.toString() is used
    // because the message alone can be null (e.g. for a
    // NullPointerException), which hides what went wrong.
    System.err.println(pid + " " + e);
   }
  }
 }

 /**
  * Rebuilds the DC datastream of one object from each of its datastreams
  * whose identifier ends in "etd".
  *
  * @param apia API-A stub used to list and read datastreams
  * @param apim API-M stub used to write the DC datastream
  * @param pid  PID of the object to process
  * @throws Exception if reading, parsing, serializing or writing fails
  */
 private static void processObject(FedoraAPIA apia, FedoraAPIM apim, String pid) throws Exception {
  DatastreamDef[] datastreams = apia.listDatastreams(pid, null);
  for (DatastreamDef def : datastreams) {
   String itemId = def.getID();
   if (!itemId.endsWith("etd")) {
    continue;
   }

   // NOTE(review): the ETD files are described as using a custom DTD,
   // so DOCTYPE processing is left enabled here. If these datastreams
   // can ever come from an untrusted source, external entity
   // resolution should be restricted -- confirm.
   DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
   factory.setNamespaceAware(true);
   DocumentBuilder builder = factory.newDocumentBuilder();

   MIMETypedStream ds = apia.getDatastreamDissemination(pid, itemId, null);
   InputStream inputStream = new ByteArrayInputStream(ds.getStream());
   Document sourceDoc = builder.parse(inputStream);

   Document destDoc = buildDublinCore(builder, sourceDoc, pid);
   byte[] dcBytes = serialize(destDoc).getBytes("UTF-8");

   // Lastly, write the modified DC datastream back to the FEDORA server
   apim.modifyDatastreamByValue(pid, "DC", null, "Dublin Core", false, "text/xml", null, dcBytes, "A", "Batch program to add DC datastream from ETD XML file", false);
  }
 }

 /**
  * Builds the oai_dc:dc document for one ETD source document.
  *
  * dc:identifier is the PID. dc:creator is the space-joined, non-empty
  * parts of the author's name. The other elements are copied from the
  * first occurrence of the matching source tag; when the source tag is
  * absent the DC element is omitted.
  *
  * @param builder   builder used to create the empty destination document
  * @param sourceDoc parsed ETD document
  * @param pid       PID of the object, used as dc:identifier
  * @return the new Dublin Core document
  */
 private static Document buildDublinCore(DocumentBuilder builder, Document sourceDoc, String pid) {
  Document destDoc = builder.newDocument();
  Element rootElement = destDoc.createElementNS(OAI_DC_NS, "oai_dc:dc");
  rootElement.setAttributeNS(XMLNS_NS, "xmlns:oai_dc", OAI_DC_NS);
  rootElement.setAttributeNS(XMLNS_NS, "xmlns:dc", DC_NS);
  destDoc.appendChild(rootElement);

  appendDcElement(destDoc, rootElement, "identifier", pid);
  appendDcElement(destDoc, rootElement, "title", firstElementText(sourceDoc, "title"));

  // author's name comes in many parts; this'll put them together
  StringBuilder author = new StringBuilder();
  for (String field : AUTHOR_NAME_TAGS) {
   String part = firstElementText(sourceDoc, field);
   if (part != null && part.length() > 0) {
    if (author.length() > 0) {
     author.append(' ');
    }
    author.append(part);
   }
  }
  appendDcElement(destDoc, rootElement, "creator", author.toString());

  appendDcElement(destDoc, rootElement, "language", firstElementText(sourceDoc, "language"));
  appendDcElement(destDoc, rootElement, "description", firstElementText(sourceDoc, "abstract"));
  appendDcElement(destDoc, rootElement, "date", firstElementText(sourceDoc, "docyear"));
  appendDcElement(destDoc, rootElement, "subject", firstElementText(sourceDoc, "subjects"));
  return destDoc;
 }

 /**
  * Appends a dc:* element holding the given text to the parent element.
  * The element is created in the Dublin Core namespace so that it is a
  * proper namespaced node rather than an element that merely has a colon
  * in its name.
  *
  * @param doc       document that owns the new element
  * @param parent    element to append to
  * @param localName local name of the DC element, e.g. "title"
  * @param value     text content; when null nothing is appended
  */
 private static void appendDcElement(Document doc, Element parent, String localName, String value) {
  if (value == null) {
   return;
  }
  Element e = doc.createElementNS(DC_NS, "dc:" + localName);
  e.appendChild(doc.createTextNode(value));
  parent.appendChild(e);
 }

 /**
  * Returns the whitespace-normalized text of the first element with the
  * given tag name.
  *
  * @param doc     document to search
  * @param tagName tag name to look up
  * @return the normalized text, or null when the document has no such element
  */
 private static String firstElementText(Document doc, String tagName) {
  NodeList matches = doc.getElementsByTagName(tagName);
  if (matches.getLength() == 0) {
   return null;
  }
  return normalizeWhitespace(matches.item(0).getTextContent());
 }

 /**
  * Replaces each line break (with surrounding spaces/tabs) and each run of
  * two or more spaces/tabs by a single space, then trims the result.
  *
  * @param raw text to normalize
  * @return the normalized text
  */
 private static String normalizeWhitespace(String raw) {
  String singleLine = LINE_BREAK.matcher(raw).replaceAll(" ");
  return BLANK_RUN.matcher(singleLine).replaceAll(" ").trim();
 }

 /**
  * Serializes a DOM document to a string without an XML declaration.
  *
  * @param doc document to serialize
  * @return the XML text
  * @throws Exception if the transformer cannot be created or fails
  */
 private static String serialize(Document doc) throws Exception {
  TransformerFactory tFactory = TransformerFactory.newInstance();
  Transformer transformer = tFactory.newTransformer();
  transformer.setOutputProperty(javax.xml.transform.OutputKeys.OMIT_XML_DECLARATION, "yes");
  StringWriter strWriter = new StringWriter();
  transformer.transform(new DOMSource(doc), new StreamResult(strWriter));
  return strWriter.toString();
 }
}