XML : R xml tree to dataframe

I have the following XML tree

  library("XML")  library("RCurl")  url <- "https://doc-0s-9c-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/rk8a2gr7rl8e8s8j0luiak0cahtcjnak/1459080000000/07495711428163271540/*/0BzmnaOABaMIgTEl6SnRUdU9Eb2M?e=download"  bin <- getURL(url)  con <- file("reference2.xml", open = "wb")  writeBin(bin, con)  close(con)  OperationList <- xmlTreeParse("reference.xml", useInternal = TRUE)    

I am able to get one dataframe for plan name and one for operation name.

  planname <- data.frame(sapply(OperationList["//subgroups/OperationGroup/subgroups/OperationGroup/operations/OperationHeader/plans/PlanHeader/name"], xmlValue))  operationanme <- data.frame(sapply(OperationList["//subgroups/OperationGroup/subgroups/OperationGroup/operations/OperationHeader/name"], xmlValue))    

but getting them together in one df (ie. flattening the xml tree) does not work.

I went through multiple approaches (cf below what I tried and the error message I got) but nothing worked so far. Thanks to point me to errors I made.

xmlToDataFrame function

  Operation.df1 <-  xmlToDataFrame(OperationList)    

duplicate subscripts for columns

xmlToDF function

as per https://hopstat.wordpress.com/2014/01/14/faster-xml-conversion-to-data-frames/

  require(XML)  xmlToDF = function(doc, xpath, isXML = TRUE, usewhich = TRUE, verbose = TRUE) {      if (!isXML)       doc = xmlParse(doc)    #### get the records for that form    nodeset <- getNodeSet(doc, xpath)      ## get the field names    var.names <- lapply(nodeset, names)      ## get the total fields that are in any record    fields = unique(unlist(var.names))      ## extract the values from all fields    dl = lapply(fields, function(x) {      if (verbose)       print(paste0("  ", x))      xpathSApply(proc, paste0(xpath, "/", x), xmlValue)    })      ## make logical matrix whether each record had that field    name.mat = t(sapply(var.names, function(x) fields %in% x))    df = data.frame(matrix(NA, nrow = nrow(name.mat), ncol = ncol(name.mat)))    names(df) = fields      ## fill in that data.frame    for (icol in 1:ncol(name.mat)) {      rep.rows = name.mat[, icol]      if (usewhich)       rep.rows = which(rep.rows)      df[rep.rows, icol] = dl[[icol]]    }      return(df)  }    Operation.df2 <- xmlToDF(OperationList,    xpath = "/subgroups/OperationGroup/subgroups/OperationGroup/name")    

Error in name.mat[, icol] : subscript out of bounds

rbind & xpathApply

  require(XML)    Operation.df3 <- xpathApply(OperationList,    "/subgroups/OperationGroup/subgroups/OperationGroup/name",    function(node) {    region <- xmlValue(node[["name"]])    xp <- "./operations/OperationHeader/name"    operation <- xpathSApply(node, xp, xmlValue)    if (is.null(operation)) operation <- NA    data.frame(region, operation, stringsAsFactors = FALSE)  })  do.call(rbind, Operation.df3  )    

gives a NULL

xmlToList and plyr

require(XML) require(plyr) OperationList2 <- xmlToList(OperationList) Operation.df4 <- ldply(OperationList2, data.frame)

*Give me arguments imply differing number of rows: 1, 0

xmlToList, plyr and data.table

  require(data.table)  Operation.df41 <- data.frame(rbindlist(OperationList2))    

Item 1 of list input is not a data.frame, data.table or list

  Operation.df42 <-  rbindlist(OperationList2)    

Item 1 of list input is not a data.frame, data.table or list

  Operation.df43 <- data.frame(matrix(unlist(OperationList2),        byrow=T),stringsAsFactors=FALSE)     

only one column

  Operation.df44 <- lapply(OperationList2, data.frame,    stringsAsFactors = FALSE)     

arguments imply differing number of rows: 1, 0

  Operation.df45 <- rbind.fill(Operation.df44)    

Using a function in a loop

Convert (possibly malformed) xml into Data Frame in R

  xp <- function (OperationList, tag){    n <- xpathSApply(OperationList, tag, xmlValue)    if (length(n) > 0)       # paste multiple values?        paste0(n, collapse="; ")     else NA  }    z <- getNodeSet(OperationList, "//subgroups/OperationGroup/subgroups/OperationGroup")  n <-length(z)  notices <-vector("list",n)  for(i in 1:n)  {    Operation.df5<-xmlDoc(z[[i]])    Operation.df5[[i]] <- data.frame(      region = xp(z2, "//name"),      operation = xp(z2, "//operations/OperationHeader/name"),      stringsAsFactors=FALSE)    free(Operation.df5)    }  do.call("rbind", Operation.df5)    

object of type 'externalptr' is not subsettable

With getNodeSet set

  for (i in 1:length(getNodeSet(OperationList, "//subgroups/OperationGroup")))   {    if (i==1) {      foo<-xmlSApply(OperationList[[i]], xmlValue)      Operation.df6 <-data.frame(t(foo), stringsAsFactors=FALSE)    }    else {      foo<-xmlSApply(OperationList[[i]], xmlValue)      tmp<-data.frame(t(foo), stringsAsFactors=FALSE)      Operation.df6 <-rbind(Operation.df6, tmp)    }  }    

No method for subsetting an XMLInternalDocument with integer

Please, help! what is missing?

No comments:

Post a Comment