Complex XML to TSV using XSLT



I have found a couple of previous questions that address parts of my problem (see here and here, but I'm having trouble integrating them. I have a set of XML records that I want to transform to tab-delimited format. However, not all the XML records have all fields, and some contain multiple instances of a field.


Two sample XML records:



<?xml version="1.0" encoding="UTF-8" ?>
<marc:collection xmlns:marc="http://ift.tt/1sMy6Fc"
xmlns:xsi="http://ift.tt/ra1lAU"
xsi:schemaLocation="http://ift.tt/1sMy6Fc http://ift.tt/1wDzy1b">
<marc:record>
<marc:leader>02179 am a 002893u </marc:leader>
<marc:controlfield tag="001">12789</marc:controlfield>
<marc:controlfield tag="005">20120521</marc:controlfield>
<marc:controlfield tag="007">cuuuu---auuuu</marc:controlfield>
<marc:controlfield tag="008">120521s|||| xx o 0 u ||| |</marc:controlfield>
<marc:datafield tag="020" ind1=" " ind2=" ">
<marc:subfield code="a">9789089640574</marc:subfield>
</marc:datafield>
<marc:datafield tag="100" ind1="1" ind2=" ">
<marc:subfield code="a">Rooij van ,Robert</marc:subfield>
<marc:subfield code="4">aut</marc:subfield>
</marc:datafield>
<marc:datafield tag="245" ind1="1" ind2=" ">
<marc:subfield code="a">New Perspectives on Games and Interaction</marc:subfield>
</marc:datafield>
<marc:datafield tag="260" ind1=" " ind2=" ">
<marc:subfield code="b">Amsterdam University Press</marc:subfield>
<marc:subfield code="c">2008</marc:subfield>
</marc:datafield>
<marc:datafield tag="300" ind1=" " ind2=" ">
<marc:subfield code="a">1 electronic resource (330 p.)</marc:subfield>
</marc:datafield>
<marc:datafield tag="520" ind1=" " ind2=" ">
<marc:subfield code="a">This volume is a collection of papers ...</marc:subfield>
</marc:datafield>
<marc:datafield tag="650" ind1=" " ind2="0">
<marc:subfield code="a">Mathematics</marc:subfield>
</marc:datafield>
<marc:datafield tag="650" ind1=" " ind2="0">
<marc:subfield code="a">Philosophy (General)</marc:subfield>
</marc:datafield>
<marc:datafield tag="650" ind1=" " ind2="0">
<marc:subfield code="a">Economic theory. Demography</marc:subfield>
</marc:datafield>
<marc:datafield tag="653" ind1=" " ind2=" ">
<marc:subfield code="a">Economics</marc:subfield>
</marc:datafield>
<marc:datafield tag="653" ind1=" " ind2=" ">
<marc:subfield code="a">Philosophy</marc:subfield>
</marc:datafield>
<marc:datafield tag="653" ind1=" " ind2=" ">
<marc:subfield code="a">Mathematics</marc:subfield>
</marc:datafield>
<marc:datafield tag="653" ind1=" " ind2=" ">
<marc:subfield code="a">Economie</marc:subfield>
</marc:datafield>
<marc:datafield tag="653" ind1=" " ind2=" ">
<marc:subfield code="a">Filosofie</marc:subfield>
</marc:datafield>
<marc:datafield tag="653" ind1=" " ind2=" ">
<marc:subfield code="a">Wiskunde</marc:subfield>
</marc:datafield>
<marc:datafield tag="700" ind1="1" ind2=" ">
<marc:subfield code="a">Apt ,Krzysztof</marc:subfield>
<marc:subfield code="4">aut</marc:subfield>
</marc:datafield>
<marc:datafield tag="856" ind1="4" ind2="0">
<marc:subfield code="u">http://ift.tt/1tRbuCv;
<marc:subfield code="z">Description of rights in Directory of Open Access Books (DOAB): Attribution Non-commercial (CC by-nc)</marc:subfield>
</marc:datafield>
<marc:datafield tag="856" ind1="4" ind2="0">
<marc:subfield code="u">http://ift.tt/1tRbuSP;
</marc:datafield>
</marc:record>
<marc:record>
<marc:leader>01452 am a 001933u </marc:leader>
<marc:controlfield tag="001">15497</marc:controlfield>
<marc:controlfield tag="005">20140217</marc:controlfield>
<marc:controlfield tag="007">cuuuu---auuuu</marc:controlfield>
<marc:controlfield tag="008">140217s|||| xx o 0 u ||| |</marc:controlfield>
<marc:datafield tag="020" ind1=" " ind2=" ">
<marc:subfield code="a">9788867050673</marc:subfield>
</marc:datafield>
<marc:datafield tag="100" ind1="1" ind2=" ">
<marc:subfield code="a">Emanuele Haus</marc:subfield>
<marc:subfield code="4">aut</marc:subfield>
</marc:datafield>
<marc:datafield tag="245" ind1="1" ind2=" ">
<marc:subfield code="a">Dynamics of an elastic satellite with internal friction.</marc:subfield>
</marc:datafield>
<marc:datafield tag="260" ind1=" " ind2=" ">
<marc:subfield code="b">Ledizioni - LediPublishing</marc:subfield>
<marc:subfield code="c">2013</marc:subfield>
</marc:datafield>
<marc:datafield tag="300" ind1=" " ind2=" ">
<marc:subfield code="a">1 electronic resource ( p.)</marc:subfield>
</marc:datafield>
<marc:datafield tag="520" ind1=" " ind2=" ">
<marc:subfield code="a">n this thesis, we study the dynamics...</marc:subfield>
</marc:datafield>
<marc:datafield tag="546" ind1=" " ind2=" ">
<marc:subfield code="a">english</marc:subfield>
</marc:datafield>
<marc:datafield tag="650" ind1=" " ind2="0">
<marc:subfield code="a">Mathematics</marc:subfield>
</marc:datafield>
<marc:datafield tag="856" ind1="4" ind2="0">
<marc:subfield code="u">http://ift.tt/1tRbsdK;
<marc:subfield code="z">Description of rights in Directory of Open Access Books (DOAB): Attribution Non-commercial Share Alike (CC by-nc-sa)</marc:subfield>
</marc:datafield>
<marc:datafield tag="856" ind1="4" ind2="0">
<marc:subfield code="u">http://ift.tt/1ywm5YL;
</marc:datafield>
</marc:record>
</marc:collection>


I've been trying to adapt the XSLT from this previous answer, with little luck so far:



<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet version="2.0" xmlns:xsl="http://ift.tt/tCZ8VR"
xpath-default-namespace="http://ift.tt/1sMy6Fc">
<xsl:output method="text"/>
<xsl:variable name="delimiter" select="'&#09;'"/>

<xsl:strip-space elements="*"/>
<xsl:output method="text"/>

<xsl:key name="field" match="/collection/record/datafield/subfield" use="concat(../@tag,@code)"/>

<!-- variable containing the first occurrence of each field -->
<xsl:variable name="allFields"
select="/collection/record/datafield/subfield[generate-id()=generate-id(key('field', concat(../@tag,@code))[1])]" />

<xsl:template match="/">

<xsl:for-each select="$allFields">
<xsl:sort select="substring(concat(../@tag,@code),1,3)" data-type="number"/>
<xsl:value-of select="concat(../@tag,@code)" />
<xsl:if test="position() &lt; last()">
<xsl:value-of select="$delimiter" />
</xsl:if>
</xsl:for-each>
<xsl:text>&#10;</xsl:text>
<xsl:apply-templates select="*/*" />
</xsl:template>

<xsl:template match="*">
<xsl:variable name="this" select="." />

<xsl:for-each select="$allFields">
<xsl:sort select="substring(concat(../@tag,@code),1,3)" data-type="number"/>
<xsl:value-of select="$this/*[@code = current()/@code]" />
<xsl:if test="position() &lt; last()">
<xsl:value-of select="$delimiter" />
</xsl:if>
</xsl:for-each>
<xsl:text>&#10;</xsl:text>
</xsl:template>
</xsl:stylesheet>


In the output I'm trying to achieve, the header would consist of the leader followed by the unique values of @tag (concatenated with subfield/@code for subfields), sorted in ascending order by tag:



leader 001 005 007 008 020a 100a 1004 245a 260b 260c 300a 520a 546a 650a 653a 700a 7004 856u 856z


If a record has multiple values for a single field/subfield combination, I want to concantenate them together, for example:



653a
Economics|Philosophy|Mathematics


However, if a record is missing a particular field, I want to just output a tab character, to keep everything aligned.


No comments:

Post a Comment