I have a function whose aim is to parse any XML data (using Duncan Temple Lang's XML package) given to it and produce output in a data.frame as shown in the desired output below. The getValues function is meant to produce list of attribute names and its associated values which then get passed into a data frame when called. But it doesn't work that way as not all attribute names and values appear in the data frame. Please see below for sample XML and the output I am currently getting.
library(XML)
getValues <- function(x) {
aList <- list()
#attributes
if(!is.null(xmlAttrs(x))) {
num.attributes = xmlSize(xmlAttrs(x))
for (i in seq_len(num.attributes)) {
attributeName <- names(xmlAttrs(x)[i])
attributeValue <- xmlAttrs(x)[[i]]
aList <- append(aList, c(Name = attributeName, Text = attributeValue))
}
}
return(aList)
}
retrieveStructureInfo <- function(node) {
if (is.null(node)) {
return()
}
nkids <- xmlSize(node)
bypass <- function(n = nkids) {
if(nkids == 0) {
xpathApply(xmlParent(node), path = xpath, fun = getValues)
} else {
return(nkids)
}
}
#children is the no. of nodes within a node
for (i in 1 : children) {
#recursive function call
retrieveStructureInfo(node[[i]])
}
}
#parse xml document
#xmlfile is the file path
doc <- xmlParse(xmlfile)
r <- xmlRoot(doc)
data <- data.frame(node = NA, value = NA)
retrieveStructureInfo(r)
data
Sample XML:
<CATALOG>
<PLANT>
<COMMON Source="a" Available="false">Bloodroot</COMMON>
<LOCATION></LOCATION>
<PARENT />
</PLANT>
<PLANT>
<COMMON Source="b" Available="true">Columbine</COMMON>
<LOCATION>Africa</LOCATION>
<PARENT />
</PLANT>
</CATALOG>
Output:
node value
source a
source b
Desired Output:
node value
source a
available false
source b
available true
No comments:
Post a Comment