These list items are microformat entries and are hidden from view.
https://dltj.org/article/fedora-batch-processing/
We had a need today to transform an XML file with a custom DTD into Dublin Core; the custom XML file is a datastream in our FEDORA repository and we want to put the Dublin Core XML file back into the FEDORA object as the DC datastream. This took a slew of technologies and techniques: reading a datastream out of the FEDORA repository using API-A, parsing XML documents using the Java DOM library, creating a new document with the correct namespaces using Java DOM, and modifying the DC datastream in the repository using API-M. I’m posting the code here in case someone else might find it useful. Of course, if you know a better way please let me know. We’ll probably need to do things like this again… /********************************************************************************** * * Copyright (C) 2006 OhioLINK * * This file is part of the OhioLINK Digital Resource Commons (DRC) Project. * * The OhioLINK DRC is free software; you can redistribute it and/or * modify it under the terms of the Affero General Public License as * published by Affero, Inc. -- either version 1 of the License, or * (at your option) any later version. * * The OhioLINK DRC Project is distributed in the hope that it will be * useful, but WITHOUT ANY WARRANTY -- without even the implied warranty * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * Affero General Public License for more details. * * You should have received a copy of the Affero General Public * License in the LICENSE.txt file that comes with the DRC project; * if not, write to DRC Development Team, OhioLINK, 2455 North Star Rd, * Suite 300, Columbus, OH 43221, USA. 
*********************************************************************************/package batch;import java.io.ByteArrayInputStream;import java.io.InputStream;import java.io.StringWriter;import java.net.MalformedURLException;import javax.xml.parsers.DocumentBuilder;import javax.xml.parsers.DocumentBuilderFactory;import javax.xml.transform.Transformer;import javax.xml.transform.TransformerFactory;import javax.xml.transform.dom.DOMSource;import javax.xml.transform.stream.StreamResult;import org.w3c.dom.Document;import org.w3c.dom.Element;import fedora.client.FedoraClient;import fedora.server.access.FedoraAPIA;import fedora.server.management.FedoraAPIM;import fedora.server.types.gen.DatastreamDef;import fedora.server.types.gen.MIMETypedStream;public class Batch { public static void main(String[] args) { for (int i = 80; i < 81; i++) { // "hdl" is our FEDORA PID prefix String pid = "hdl:" + i; try { FedoraClient client = new FedoraClient( "http://fedora.server/fedora", "fedoraAdmin", "password"); FedoraAPIA apia = client.getAPIA(); FedoraAPIM apim = client.getAPIM(); // // Get the list of datastreams for this object. 
For each one, we're // going to look for an identifier that ends in "etd" DatastreamDef[] datastreams = apia.listDatastreams(pid, null); for (int j = 0; j < datastreams.length; j++) { DatastreamDef def = datastreams[j]; String itemId = def.getID(); if (itemId.endsWith("etd")) { // // If we've found it, get it out of the FEDORA server and // create a XML DOM document for it MIMETypedStream ds = apia.getDatastreamDissemination(pid,itemId,null); byte[] file = ds.getStream(); InputStream inputStream = new ByteArrayInputStream(file); // String fileStr = new String(file, "ascii"); // System.out.println(fileStr); DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); factory.setNamespaceAware(true); DocumentBuilder builder = factory.newDocumentBuilder(); Document sourceDoc = builder.parse(inputStream); // // Now build an empty XML DOM document for the Dublin Core Document destDoc = builder.newDocument(); Element rootElement=destDoc.createElementNS("http://www.openarchives.org/OAI/2.0/oai_dc/","oai_dc:dc"); rootElement.setAttributeNS("http://www.w3.org/2000/xmlns/","xmlns:oai_dc","http://www.openarchives.org/OAI/2.0/oai_dc/"); rootElement.setAttributeNS("http://www.w3.org/2000/xmlns/","xmlns:dc","http://purl.org/dc/elements/1.1/"); destDoc.appendChild(rootElement); // // Now copy the values from the ETD XML document into // the DC XML document Element e; String value; e=destDoc.createElement("dc:identifier"); e.appendChild(destDoc.createTextNode(pid)); rootElement.appendChild(e); e=destDoc.createElement("dc:title"); value=sourceDoc.getElementsByTagName("title").item(0).getTextContent().replaceAll("[\t ]*\n[\t ]*", " ").replaceAll("[\t ][\t ]+", " ").trim(); e.appendChild(destDoc.createTextNode(value)); rootElement.appendChild(e); // author's name comes in many parts; this'll put them together e = destDoc.createElement("dc:creator"); String nameFields[] = { "authfname", "authmname", "authlname", "authsuffix"}; String author = new String(); for (String field 
: nameFields) { value = sourceDoc.getElementsByTagName(field).item(0).getTextContent().replaceAll("[\t ]*\n[\t ]*", " ").replaceAll("[\t ][\t ]+", " ").trim(); if (value != null && !value.equals("")) { author = author.concat(value).concat(" "); } } e.appendChild(destDoc.createTextNode(author.trim())); rootElement.appendChild(e); e=destDoc.createElement("dc:language"); value=sourceDoc.getElementsByTagName("language").item(0).getTextContent().replaceAll("[\t ]*\n[\t ]*", " ").replaceAll("[\t ][\t ]+", " ").trim(); e.appendChild(destDoc.createTextNode(value)); rootElement.appendChild(e); e=destDoc.createElement("dc:description"); value=sourceDoc.getElementsByTagName("abstract").item(0).getTextContent().replaceAll("[\t ]*\n[\t ]*", " ").replaceAll("[\t ][\t ]+", " ").trim(); e.appendChild(destDoc.createTextNode(value)); rootElement.appendChild(e); e=destDoc.createElement("dc:date"); value=sourceDoc.getElementsByTagName("docyear").item(0).getTextContent().replaceAll("[\t ]*\n[\t ]*", " ").replaceAll("[\t ][\t ]+", " ").trim(); e.appendChild(destDoc.createTextNode(value)); rootElement.appendChild(e); e=destDoc.createElement("dc:subject"); value = sourceDoc.getElementsByTagName("subjects").item(0).getTextContent().replaceAll("[\t ]*\n[\t ]*", " ").replaceAll("[\t ][\t ]+", " ").trim(); e.appendChild(destDoc.createTextNode(value)); rootElement.appendChild(e); // // Use a Transformer for output TransformerFactory tFactory = TransformerFactory.newInstance(); Transformer transformer = tFactory.newTransformer(); transformer.setOutputProperty(javax.xml.transform.OutputKeys.OMIT_XML_DECLARATION, "yes"); DOMSource source = new DOMSource(destDoc); StringWriter strWriter = new StringWriter(); StreamResult result = new StreamResult(strWriter); transformer.transform(source, result); String xmlAsString=strWriter.getBuffer().toString(); // System.out.println(xmlAsString); byte[] normalarr=xmlAsString.getBytes("UTF-8"); // // Lastly, write the modified DC datastream back to the FEDORA 
server apim.modifyDatastreamByValue(pid, "DC", null, "Dublin Core", false, "text/xml", null, normalarr, "A", "Batch program to add DC datastream from ETD XML file", false); } } } catch (MalformedURLException e) { System.out.println(pid+" "+e.getLocalizedMessage()); } catch (Exception e) { System.out.println(pid+" "+e.getLocalizedMessage()); } } }}
2006-12-14T02:18:30+00:00
2024-07-20T16:35:17+00:00

Java Application for Batch Processing FEDORA Objects

Posted on December 14, 2006 · 3 minute read

× This article was imported from this blog's previous content management system (WordPress), and may have errors in formatting and functionality. If you find these errors are a significant barrier to understanding the article, please let me know.

We had a need today to transform an XML file with a custom DTD into Dublin Core; the custom XML file is a datastream in our FEDORA repository and we want to put the Dublin Core XML file back into the FEDORA object as the DC datastream. This took a slew of technologies and techniques: reading a datastream out of the FEDORA repository using API-A, parsing XML documents using the Java DOM library, creating a new document with the correct namespaces using Java DOM, and modifying the DC datastream in the repository using API-M.

I’m posting the code here in case someone else might find it useful. Of course, if you know a better way please let me know. We’ll probably need to do things like this again…

/**********************************************************************************
 *
 * Copyright (C) 2006 OhioLINK
 *
 * This file is part of the OhioLINK Digital Resource Commons (DRC) Project.
 *
 * The OhioLINK DRC is free software; you can redistribute it and/or
 * modify it under the terms of the Affero General Public License as
 * published by Affero, Inc. -- either version 1 of the License, or
 * (at your option) any later version.
 *
 * The OhioLINK DRC Project is distributed in the hope that it will be
 * useful, but WITHOUT ANY WARRANTY -- without even the implied warranty
 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * Affero General Public License for more details.
 *
 * You should have received a copy of the Affero General Public
 * License in the LICENSE.txt file that comes with the DRC project;
 * if not, write to DRC Development Team, OhioLINK, 2455 North Star Rd,
 * Suite 300, Columbus, OH 43221, USA.
 *********************************************************************************/

package batch;

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.io.StringWriter;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.util.regex.Pattern;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NodeList;

import fedora.client.FedoraClient;
import fedora.server.access.FedoraAPIA;
import fedora.server.management.FedoraAPIM;
import fedora.server.types.gen.DatastreamDef;
import fedora.server.types.gen.MIMETypedStream;

/**
 * Batch utility that reads the ETD XML datastream of a range of FEDORA
 * objects (via API-A), derives an oai_dc Dublin Core record from it, and
 * writes that record back as the object's "DC" datastream (via API-M).
 */
public class Batch {

 // NOTE(review): connection settings are hard-coded placeholders; confirm
 // how real credentials should be supplied before running this anywhere.
 private static final String FEDORA_URL = "http://fedora.server/fedora";
 private static final String FEDORA_USER = "fedoraAdmin";
 private static final String FEDORA_PASSWORD = "password";

 // "hdl" is our FEDORA PID prefix
 private static final String PID_PREFIX = "hdl:";
 // Numeric PID range to process: FIRST_PID inclusive, END_PID exclusive.
 private static final int FIRST_PID = 80;
 private static final int END_PID = 81;

 private static final String OAI_DC_NS = "http://www.openarchives.org/OAI/2.0/oai_dc/";
 private static final String DC_NS = "http://purl.org/dc/elements/1.1/";
 private static final String XMLNS_NS = "http://www.w3.org/2000/xmlns/";

 // The author's name comes in many parts; they are joined in this order.
 private static final String[] AUTHOR_NAME_FIELDS = {
   "authfname", "authmname", "authlname", "authsuffix" };

 // Compiled once instead of on every String.replaceAll call.
 private static final Pattern LINE_BREAK = Pattern.compile("[\t ]*\n[\t ]*");
 private static final Pattern BLANK_RUN = Pattern.compile("[\t ][\t ]+");

 /**
  * Entry point. Connects to the FEDORA server once, then converts every
  * object in the configured PID range. A failure on one object is reported
  * on standard error and does not stop the remaining objects.
  *
  * @param args command-line arguments (unused)
  */
 public static void main(String[] args) {
  FedoraAPIA apia;
  FedoraAPIM apim;
  DocumentBuilder builder;
  Transformer transformer;

  // The client, parser and serializer do not depend on the PID, so they
  // are created once up front rather than once per object.
  try {
   FedoraClient client = new FedoraClient(FEDORA_URL, FEDORA_USER, FEDORA_PASSWORD);
   apia = client.getAPIA();
   apim = client.getAPIM();

   // NOTE(review): the source datastreams use a custom DTD, so DOCTYPE
   // processing is left enabled; confirm repository content is trusted.
   DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
   factory.setNamespaceAware(true);
   builder = factory.newDocumentBuilder();

   transformer = TransformerFactory.newInstance().newTransformer();
   transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
  } catch (MalformedURLException e) {
   System.err.println("Invalid FEDORA URL " + FEDORA_URL + ": " + e);
   return;
  } catch (Exception e) {
   System.err.println("Unable to set up FEDORA client or XML tooling: " + e);
   return;
  }

  for (int i = FIRST_PID; i < END_PID; i++) {
   String pid = PID_PREFIX + i;
   try {
    convertObject(apia, apim, builder, transformer, pid);
   } catch (Exception e) {
    // Print the exception itself (type + message): getLocalizedMessage()
    // alone is null for e.g. a NullPointerException.
    System.err.println(pid + " " + e);
   }
  }
 }

 /**
  * Converts every datastream of one object whose identifier ends in "etd"
  * into Dublin Core and stores the result as the object's "DC" datastream.
  *
  * @param apia        FEDORA access API used to list and read datastreams
  * @param apim        FEDORA management API used to write the DC datastream
  * @param builder     namespace-aware parser used for source and result documents
  * @param transformer serializer configured to omit the XML declaration
  * @param pid         PID of the object to process
  * @throws Exception if the repository calls, parsing or serialization fail
  */
 private static void convertObject(FedoraAPIA apia, FedoraAPIM apim,
   DocumentBuilder builder, Transformer transformer, String pid) throws Exception {
  //
  // Get the list of datastreams for this object.  For each one, we're
  // going to look for an identifier that ends in "etd"
  DatastreamDef[] datastreams = apia.listDatastreams(pid, null);
  if (datastreams == null) {
   return;
  }
  for (DatastreamDef def : datastreams) {
   String itemId = def.getID();
   if (itemId == null || !itemId.endsWith("etd")) {
    continue;
   }

   //
   // If we've found it, get it out of the FEDORA server and
   // create an XML DOM document for it
   MIMETypedStream ds = apia.getDatastreamDissemination(pid, itemId, null);
   InputStream inputStream = new ByteArrayInputStream(ds.getStream());
   Document sourceDoc = builder.parse(inputStream);

   Document destDoc = buildDublinCore(builder, sourceDoc, pid);
   byte[] dcBytes = serialize(transformer, destDoc);

   //
   // Lastly, write the modified DC datastream back to the FEDORA server
   apim.modifyDatastreamByValue(pid, "DC", null, "Dublin Core", false, "text/xml", null, dcBytes, "A", "Batch program to add DC datastream from ETD XML file", false);
  }
 }

 /**
  * Builds the oai_dc document for one ETD record.
  *
  * @param builder   parser used to create the empty result document
  * @param sourceDoc parsed ETD XML document
  * @param pid       PID of the object, used as dc:identifier
  * @return a new document whose root is oai_dc:dc
  */
 private static Document buildDublinCore(DocumentBuilder builder, Document sourceDoc, String pid) {
  Document destDoc = builder.newDocument();
  Element rootElement = destDoc.createElementNS(OAI_DC_NS, "oai_dc:dc");
  rootElement.setAttributeNS(XMLNS_NS, "xmlns:oai_dc", OAI_DC_NS);
  rootElement.setAttributeNS(XMLNS_NS, "xmlns:dc", DC_NS);
  destDoc.appendChild(rootElement);

  appendDcElement(rootElement, "dc:identifier", pid);
  appendDcElement(rootElement, "dc:title", firstText(sourceDoc, "title"));

  // Join whichever name parts are present with single spaces.
  StringBuilder author = new StringBuilder();
  for (String field : AUTHOR_NAME_FIELDS) {
   String part = firstText(sourceDoc, field);
   if (part != null && part.length() > 0) {
    if (author.length() > 0) {
     author.append(' ');
    }
    author.append(part);
   }
  }
  appendDcElement(rootElement, "dc:creator", author.toString());

  appendDcElement(rootElement, "dc:language", firstText(sourceDoc, "language"));
  appendDcElement(rootElement, "dc:description", firstText(sourceDoc, "abstract"));
  appendDcElement(rootElement, "dc:date", firstText(sourceDoc, "docyear"));
  appendDcElement(rootElement, "dc:subject", firstText(sourceDoc, "subjects"));
  return destDoc;
 }

 /**
  * Appends a Dublin Core child element holding {@code value} as text.
  * The element is created in the Dublin Core namespace so the tree is
  * consistently namespace-aware. Nothing is appended when {@code value}
  * is {@code null}, i.e. when the source had no matching element.
  *
  * @param parent        element to append to
  * @param qualifiedName prefixed element name, e.g. "dc:title"
  * @param value         text content, or {@code null} to skip the element
  */
 private static void appendDcElement(Element parent, String qualifiedName, String value) {
  if (value == null) {
   return;
  }
  Document owner = parent.getOwnerDocument();
  Element child = owner.createElementNS(DC_NS, qualifiedName);
  child.appendChild(owner.createTextNode(value));
  parent.appendChild(child);
 }

 /**
  * Returns the whitespace-normalized text of the first element with the
  * given tag name.
  *
  * @param doc     document to search
  * @param tagName tag name to look up
  * @return the normalized text, or {@code null} when no such element exists
  */
 private static String firstText(Document doc, String tagName) {
  NodeList matches = doc.getElementsByTagName(tagName);
  if (matches.getLength() == 0) {
   return null;
  }
  String text = matches.item(0).getTextContent();
  return text == null ? null : normalizeWhitespace(text);
 }

 /**
  * Replaces each line break (with surrounding tabs/spaces) by one space,
  * collapses runs of two or more tabs/spaces to one space, and trims.
  *
  * @param text raw element text
  * @return the normalized text
  */
 private static String normalizeWhitespace(String text) {
  String singleLine = LINE_BREAK.matcher(text).replaceAll(" ");
  return BLANK_RUN.matcher(singleLine).replaceAll(" ").trim();
 }

 /**
  * Serializes a DOM document to UTF-8 bytes using the given transformer.
  *
  * @param transformer configured serializer
  * @param doc         document to serialize
  * @return the serialized XML encoded as UTF-8
  * @throws TransformerException         if serialization fails
  * @throws UnsupportedEncodingException if UTF-8 is unavailable
  */
 private static byte[] serialize(Transformer transformer, Document doc)
   throws TransformerException, UnsupportedEncodingException {
  StringWriter strWriter = new StringWriter();
  transformer.transform(new DOMSource(doc), new StreamResult(strWriter));
  return strWriter.toString().getBytes("UTF-8");
 }
}

Social Media Interactions

No reposts were found.

No likes were found.

No webmentions were found.

Share on

Mastodon/Fediverse Twitter Facebook LinkedIn

Java Application for Batch Processing FEDORA Objects

Social Media Interactions

Share on

You may also enjoy

Ghost Newsletter Software Findings: Got Past the Mailgun Problem, but Got Stuck On Ugly HTML

Digital versus Digitized: On the Hachette v. Internet Archive Appeal Oral Argument

The ILS without patron data: open questions

The ILS without patron data: a thought experiment realized with FOLIO

Likes

Reposts

Discussion