These list items are microformat entries and are hidden from view.
https://dltj.org/article/fedora-batch-processing/
We had a need today to transform an XML file with a custom DTD into Dublin Core; the custom XML file is a datastream in our FEDORA repository and we want to put the Dublin Core XML file back into the FEDORA object as the DC datastream. This took a slew of technologies and techniques: reading a datastream out of the FEDORA repository using API-A, parsing XML documents using the Java DOM library, creating a new document with the correct namespaces using Java DOM, and modifying the DC datastream in the repository using API-M. I’m posting the code here in case someone else might find it useful. Of course, if you know a better way please let me know. We’ll probably need to do things like this again… /********************************************************************************** * * Copyright (C) 2006 OhioLINK * * This file is part of the OhioLINK Digital Resource Commons (DRC) Project. * * The OhioLINK DRC is free software; you can redistribute it and/or * modify it under the terms of the Affero General Public License as * published by Affero, Inc. -- either version 1 of the License, or * (at your option) any later version. * * The OhioLINK DRC Project is distributed in the hope that it will be * useful, but WITHOUT ANY WARRANTY -- without even the implied warranty * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * Affero General Public License for more details. * * You should have received a copy of the Affero General Public * License in the LICENSE.txt file that comes with the DRC project; * if not, write to DRC Development Team, OhioLINK, 2455 North Star Rd, * Suite 300, Columbus, OH 43221, USA. 
*********************************************************************************/package batch;import java.io.ByteArrayInputStream;import java.io.InputStream;import java.io.StringWriter;import java.net.MalformedURLException;import javax.xml.parsers.DocumentBuilder;import javax.xml.parsers.DocumentBuilderFactory;import javax.xml.transform.Transformer;import javax.xml.transform.TransformerFactory;import javax.xml.transform.dom.DOMSource;import javax.xml.transform.stream.StreamResult;import org.w3c.dom.Document;import org.w3c.dom.Element;import fedora.client.FedoraClient;import fedora.server.access.FedoraAPIA;import fedora.server.management.FedoraAPIM;import fedora.server.types.gen.DatastreamDef;import fedora.server.types.gen.MIMETypedStream;public class Batch { public static void main(String[] args) { for (int i = 80; i < 81; i++) { // "hdl" is our FEDORA PID prefix String pid = "hdl:" + i; try { FedoraClient client = new FedoraClient( "http://fedora.server/fedora", "fedoraAdmin", "password"); FedoraAPIA apia = client.getAPIA(); FedoraAPIM apim = client.getAPIM(); // // Get the list of datastreams for this object. 
For each one, we're // going to look for an identifier that ends in "etd" DatastreamDef[] datastreams = apia.listDatastreams(pid, null); for (int j = 0; j < datastreams.length; j++) { DatastreamDef def = datastreams[j]; String itemId = def.getID(); if (itemId.endsWith("etd")) { // // If we've found it, get it out of the FEDORA server and // create a XML DOM document for it MIMETypedStream ds = apia.getDatastreamDissemination(pid,itemId,null); byte[] file = ds.getStream(); InputStream inputStream = new ByteArrayInputStream(file); // String fileStr = new String(file, "ascii"); // System.out.println(fileStr); DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); factory.setNamespaceAware(true); DocumentBuilder builder = factory.newDocumentBuilder(); Document sourceDoc = builder.parse(inputStream); // // Now build an empty XML DOM document for the Dublin Core Document destDoc = builder.newDocument(); Element rootElement=destDoc.createElementNS("http://www.openarchives.org/OAI/2.0/oai_dc/","oai_dc:dc"); rootElement.setAttributeNS("http://www.w3.org/2000/xmlns/","xmlns:oai_dc","http://www.openarchives.org/OAI/2.0/oai_dc/"); rootElement.setAttributeNS("http://www.w3.org/2000/xmlns/","xmlns:dc","http://purl.org/dc/elements/1.1/"); destDoc.appendChild(rootElement); // // Now copy the values from the ETD XML document into // the DC XML document Element e; String value; e=destDoc.createElement("dc:identifier"); e.appendChild(destDoc.createTextNode(pid)); rootElement.appendChild(e); e=destDoc.createElement("dc:title"); value=sourceDoc.getElementsByTagName("title").item(0).getTextContent().replaceAll("[\t ]*\n[\t ]*", " ").replaceAll("[\t ][\t ]+", " ").trim(); e.appendChild(destDoc.createTextNode(value)); rootElement.appendChild(e); // author's name comes in many parts; this'll put them together e = destDoc.createElement("dc:creator"); String nameFields[] = { "authfname", "authmname", "authlname", "authsuffix"}; String author = new String(); for (String field 
: nameFields) { value = sourceDoc.getElementsByTagName(field).item(0).getTextContent().replaceAll("[\t ]*\n[\t ]*", " ").replaceAll("[\t ][\t ]+", " ").trim(); if (value != null && !value.equals("")) { author = author.concat(value).concat(" "); } } e.appendChild(destDoc.createTextNode(author.trim())); rootElement.appendChild(e); e=destDoc.createElement("dc:language"); value=sourceDoc.getElementsByTagName("language").item(0).getTextContent().replaceAll("[\t ]*\n[\t ]*", " ").replaceAll("[\t ][\t ]+", " ").trim(); e.appendChild(destDoc.createTextNode(value)); rootElement.appendChild(e); e=destDoc.createElement("dc:description"); value=sourceDoc.getElementsByTagName("abstract").item(0).getTextContent().replaceAll("[\t ]*\n[\t ]*", " ").replaceAll("[\t ][\t ]+", " ").trim(); e.appendChild(destDoc.createTextNode(value)); rootElement.appendChild(e); e=destDoc.createElement("dc:date"); value=sourceDoc.getElementsByTagName("docyear").item(0).getTextContent().replaceAll("[\t ]*\n[\t ]*", " ").replaceAll("[\t ][\t ]+", " ").trim(); e.appendChild(destDoc.createTextNode(value)); rootElement.appendChild(e); e=destDoc.createElement("dc:subject"); value = sourceDoc.getElementsByTagName("subjects").item(0).getTextContent().replaceAll("[\t ]*\n[\t ]*", " ").replaceAll("[\t ][\t ]+", " ").trim(); e.appendChild(destDoc.createTextNode(value)); rootElement.appendChild(e); // // Use a Transformer for output TransformerFactory tFactory = TransformerFactory.newInstance(); Transformer transformer = tFactory.newTransformer(); transformer.setOutputProperty(javax.xml.transform.OutputKeys.OMIT_XML_DECLARATION, "yes"); DOMSource source = new DOMSource(destDoc); StringWriter strWriter = new StringWriter(); StreamResult result = new StreamResult(strWriter); transformer.transform(source, result); String xmlAsString=strWriter.getBuffer().toString(); // System.out.println(xmlAsString); byte[] normalarr=xmlAsString.getBytes("UTF-8"); // // Lastly, write the modified DC datastream back to the FEDORA 
server apim.modifyDatastreamByValue(pid, "DC", null, "Dublin Core", false, "text/xml", null, normalarr, "A", "Batch program to add DC datastream from ETD XML file", false); } } } catch (MalformedURLException e) { System.out.println(pid+" "+e.getLocalizedMessage()); } catch (Exception e) { System.out.println(pid+" "+e.getLocalizedMessage()); } } }}
2006-12-14T02:18:30+00:00
2018-01-16T03:38:08+00:00

Java Application for Batch Processing FEDORA Objects

Posted on December 14, 2006 3 minute read

× This article was imported from this blog's previous content management system (WordPress), and may have errors in formatting and functionality. If you find these errors are a significant barrier to understanding the article, please let me know.

We had a need today to transform an XML file with a custom DTD into Dublin Core; the custom XML file is a datastream in our FEDORA repository and we want to put the Dublin Core XML file back into the FEDORA object as the DC datastream. This took a slew of technologies and techniques: reading a datastream out of the FEDORA repository using API-A, parsing XML documents using the Java DOM library, creating a new document with the correct namespaces using Java DOM, and modifying the DC datastream in the repository using API-M.

I’m posting the code here in case someone else might find it useful. Of course, if you know a better way please let me know. We’ll probably need to do things like this again…

/**********************************************************************************
 *
 * Copyright (C) 2006 OhioLINK
 *
 * This file is part of the OhioLINK Digital Resource Commons (DRC) Project.
 *
 * The OhioLINK DRC is free software; you can redistribute it and/or
 * modify it under the terms of the Affero General Public License as
 * published by Affero, Inc. -- either version 1 of the License, or
 * (at your option) any later version.
 *
 * The OhioLINK DRC Project is distributed in the hope that it will be
 * useful, but WITHOUT ANY WARRANTY -- without even the implied warranty
 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * Affero General Public License for more details.
 *
 * You should have received a copy of the Affero General Public
 * License in the LICENSE.txt file that comes with the DRC project;
 * if not, write to DRC Development Team, OhioLINK, 2455 North Star Rd,
 * Suite 300, Columbus, OH 43221, USA.
 *********************************************************************************/

package batch;

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.io.StringWriter;
import java.net.MalformedURLException;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.w3c.dom.Document;
import org.w3c.dom.Element;

import fedora.client.FedoraClient;
import fedora.server.access.FedoraAPIA;
import fedora.server.management.FedoraAPIM;
import fedora.server.types.gen.DatastreamDef;
import fedora.server.types.gen.MIMETypedStream;

/**
 * Batch program that builds a Dublin Core record from the ETD XML datastream
 * of FEDORA objects and stores it as each object's "DC" datastream.
 *
 * <p>For every PID in the hard-coded range the program lists the object's
 * datastreams through API-A, parses each datastream whose ID ends in "etd",
 * copies selected ETD elements into an {@code oai_dc:dc} document, and writes
 * that document back through API-M.
 */
public class Batch {

    /** Namespace URI of the {@code oai_dc:dc} container element. */
    private static final String OAI_DC_NS = "http://www.openarchives.org/OAI/2.0/oai_dc/";

    /** Namespace URI of the Dublin Core elements ({@code dc:*}). */
    private static final String DC_NS = "http://purl.org/dc/elements/1.1/";

    /** Namespace URI that {@code xmlns:*} declaration attributes live in. */
    private static final String XMLNS_NS = "http://www.w3.org/2000/xmlns/";

    /** ETD elements that make up the author's name, in display order. */
    private static final String[] AUTHOR_NAME_FIELDS = {
        "authfname", "authmname", "authlname", "authsuffix"
    };

    /**
     * Runs the conversion for every PID in the hard-coded range.
     *
     * <p>A failure while processing one object is reported on standard output
     * together with its PID and does not stop the remaining objects.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        for (int i = 80; i < 81; i++) {
            // "hdl" is our FEDORA PID prefix
            String pid = "hdl:" + i;

            try {
                // NOTE(review): the server URL and credentials are hard-coded;
                // consider moving them to configuration before reuse.
                FedoraClient client = new FedoraClient(
                        "http://fedora.server/fedora",
                        "fedoraAdmin", "password");
                FedoraAPIA apia = client.getAPIA();
                FedoraAPIM apim = client.getAPIM();

                //
                // Get the list of datastreams for this object. For each one,
                // we're going to look for an identifier that ends in "etd"
                DatastreamDef[] datastreams = apia.listDatastreams(pid, null);
                for (DatastreamDef def : datastreams) {
                    String itemId = def.getID();
                    if (!itemId.endsWith("etd")) {
                        continue;
                    }

                    //
                    // If we've found it, get it out of the FEDORA server and
                    // create an XML DOM document for it
                    MIMETypedStream ds = apia.getDatastreamDissemination(pid, itemId, null);
                    InputStream inputStream = new ByteArrayInputStream(ds.getStream());

                    DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
                    factory.setNamespaceAware(true);
                    DocumentBuilder builder = factory.newDocumentBuilder();
                    Document sourceDoc = builder.parse(inputStream);

                    Document destDoc = buildDublinCore(builder.newDocument(), pid, sourceDoc);
                    byte[] dcBytes = serialize(destDoc);

                    //
                    // Lastly, write the modified DC datastream back to the FEDORA server
                    apim.modifyDatastreamByValue(pid, "DC", null, "Dublin Core", false, "text/xml", null, dcBytes, "A", "Batch program to add DC datastream from ETD XML file", false);
                }
            } catch (Exception e) {
                // Print the exception itself rather than only its message so that
                // message-less failures (e.g. NullPointerException) stay identifiable.
                System.out.println(pid + " " + e);
            }
        }
    }

    /**
     * Fills an empty document with an {@code oai_dc:dc} record derived from an ETD document.
     *
     * @param destDoc   empty document that receives the Dublin Core record
     * @param pid       PID of the object, stored as {@code dc:identifier}
     * @param sourceDoc parsed ETD XML document
     * @return {@code destDoc}, now containing the record
     */
    private static Document buildDublinCore(Document destDoc, String pid, Document sourceDoc) {
        Element root = destDoc.createElementNS(OAI_DC_NS, "oai_dc:dc");
        root.setAttributeNS(XMLNS_NS, "xmlns:oai_dc", OAI_DC_NS);
        root.setAttributeNS(XMLNS_NS, "xmlns:dc", DC_NS);
        destDoc.appendChild(root);

        appendDcElement(destDoc, root, "dc:identifier", pid);
        appendDcElement(destDoc, root, "dc:title", firstElementText(sourceDoc, "title"));

        // The author's name comes in many parts; put the non-empty ones together.
        StringBuilder author = new StringBuilder();
        for (String field : AUTHOR_NAME_FIELDS) {
            String part = firstElementText(sourceDoc, field);
            if (part != null && part.length() > 0) {
                author.append(part).append(' ');
            }
        }
        appendDcElement(destDoc, root, "dc:creator", author.toString().trim());

        appendDcElement(destDoc, root, "dc:language", firstElementText(sourceDoc, "language"));
        appendDcElement(destDoc, root, "dc:description", firstElementText(sourceDoc, "abstract"));
        appendDcElement(destDoc, root, "dc:date", firstElementText(sourceDoc, "docyear"));
        appendDcElement(destDoc, root, "dc:subject", firstElementText(sourceDoc, "subjects"));
        return destDoc;
    }

    /**
     * Appends one Dublin Core element holding {@code value} to {@code parent}.
     *
     * <p>The element is created with {@code createElementNS} so that it carries
     * the Dublin Core namespace URI and a local name, matching the
     * namespace-aware root element. Nothing is appended when {@code value} is
     * {@code null}, i.e. when the source element was absent.
     *
     * @param doc           document that owns the new element
     * @param parent        element to append to
     * @param qualifiedName prefixed element name such as {@code dc:title}
     * @param value         text content, or {@code null} to skip the element
     */
    private static void appendDcElement(Document doc, Element parent, String qualifiedName, String value) {
        if (value == null) {
            return;
        }
        Element element = doc.createElementNS(DC_NS, qualifiedName);
        element.appendChild(doc.createTextNode(value));
        parent.appendChild(element);
    }

    /**
     * Returns the whitespace-normalised text of the first element named {@code tagName}.
     *
     * @param doc     document to search
     * @param tagName element name to look up
     * @return the normalised text, or {@code null} when no such element exists
     */
    private static String firstElementText(Document doc, String tagName) {
        // NodeList.item returns null when the index is out of range.
        org.w3c.dom.Node first = doc.getElementsByTagName(tagName).item(0);
        if (first == null) {
            return null;
        }
        String text = first.getTextContent();
        return text == null ? null : normalizeWhitespace(text);
    }

    /**
     * Replaces each line break (with surrounding tabs/spaces) by one space,
     * collapses runs of tabs/spaces to one space, and trims the result.
     *
     * @param raw text as found in the source document
     * @return the normalised text
     */
    private static String normalizeWhitespace(String raw) {
        return raw.replaceAll("[\t ]*\n[\t ]*", " ").replaceAll("[\t ][\t ]+", " ").trim();
    }

    /**
     * Serialises a DOM document to UTF-8 bytes without an XML declaration.
     *
     * @param doc document to serialise
     * @return the serialised document
     * @throws javax.xml.transform.TransformerException if the transformer cannot be created or fails
     * @throws java.io.UnsupportedEncodingException     if UTF-8 is not available
     */
    private static byte[] serialize(Document doc)
            throws javax.xml.transform.TransformerException, java.io.UnsupportedEncodingException {
        Transformer transformer = TransformerFactory.newInstance().newTransformer();
        transformer.setOutputProperty(javax.xml.transform.OutputKeys.OMIT_XML_DECLARATION, "yes");
        StringWriter out = new StringWriter();
        transformer.transform(new DOMSource(doc), new StreamResult(out));
        return out.toString().getBytes("UTF-8");
    }
}

Social Media Interactions

No reposts were found.

No likes were found.

No webmentions were found.

Share on

Mastodon/Fediverse Twitter Facebook LinkedIn

Java Application for Batch Processing FEDORA Objects

Social Media Interactions

Share on

You may also enjoy

Learnings from the British Library Cybersecurity Report

One Year of Learning 2023

Restoring Obsidian Knowledgebase from MacOS Time Machine at the Command Line

Processing WOLFcon Conference Recordings with FFMPEG

Likes

Reposts

Discussion