Repeating Elements To New Rows Elementtree
Solution 1:
This code should do the job:
from xml.etree import ElementTree as ET
from collections import defaultdict
import csv
from pathlib import Path
# Folder that is searched (recursively) for *.xml input files.
directory = '.'

# Column order of the output CSV; each name is also the key used to look up
# the value of that column in a row mapping.
headers = ['id', 'service_code', 'rational', 'qualify', 'description_num', 'description_txt', 'set_data_xin', 'set_data_xax', 'set_data_value', 'set_data_x']


def build_rows(start_node, columns=None):
    """Return one CSV row per ``SetData`` element found under *start_node*.

    The attributes of *start_node* and the text of its ``Rational``,
    ``Qualify`` and ``Description`` descendants (plus the ``num`` attribute
    of ``Description``) are repeated on every row.  Each ``SetData``
    attribute ``name`` lands in the column ``set_data_<name>``.

    Args:
        start_node: An ``xml.etree.ElementTree.Element`` (a ``START`` node).
        columns: Column names, in output order.  Defaults to ``headers``.

    Returns:
        A list of rows; each row is a list with one value per column.
        Columns with no matching data hold ``''``.  The list is empty when
        *start_node* has no ``SetData`` descendant.
    """
    if columns is None:
        columns = headers

    # Values shared by every row produced for this START node.
    repeated_values = dict(start_node.attrib)
    for rn in start_node.findall('.//Rational'):
        repeated_values['rational'] = rn.text
    for qu in start_node.findall('.//Qualify'):
        repeated_values['qualify'] = qu.text
    for ds in start_node.findall('.//Description'):
        repeated_values['description_txt'] = ds.text
        # A missing "num" attribute yields an empty cell instead of KeyError.
        repeated_values['description_num'] = ds.attrib.get('num', '')

    rows = []
    for st in start_node.findall('.//SetData'):
        # defaultdict(str) makes every column that is absent read as ''.
        row = defaultdict(str)
        for k, v in st.attrib.items():
            row['set_data_' + str(k)] = v
        # Applied after the SetData attributes, so shared values win on a
        # key clash (same precedence as assigning them last).
        row.update(repeated_values)
        rows.append([row[col] for col in columns])
    return rows


with open('output.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(headers)
    xml_files_list = list(map(str, Path(directory).glob('**/*.xml')))
    print(xml_files_list)
    for xml_file in xml_files_list:
        tree = ET.parse(xml_file)
        root = tree.getroot()
        for sn in root.findall('.//START'):
            writer.writerows(build_rows(sn))
Explanation
The problem is that the code from the question writes every node of the XML document to the CSV only once. Assuming that `SetData` is the most frequently repeated node, we can use it to determine how many rows have to be written and repeat the other data on each of those rows.
This approach will not work if that assumption is incorrect.
Solution 2:
Consider the special-purpose language XSLT, run through Python's third-party module `lxml`, to transform the XML directly to CSV output. Specifically, have XSLT pull from the lowest level, `SetData`, and retrieve upper-level information with the `ancestor` axis.
XSLT (save as a .xsl file, a special .xml file)
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
  <!-- Emit plain text (CSV), not XML. -->
  <xsl:output indent="yes" method="text"/>
  <xsl:strip-space elements="*"/>

  <!-- Field separator placed between the CSV columns. -->
  <xsl:variable name="delim">,</xsl:variable>

  <!-- Entry point: write the header line, then one line per SetData node. -->
  <xsl:template match="/ProjectData">
    <!-- HEADERS -->
    <xsl:text>id,service_code,rational,qualify,description_num,description,</xsl:text>
    <xsl:text>data_file_dg,data_file_dg_id,data_file_unit,data_file_unit_id,</xsl:text>
    <xsl:text>set_data_x,set_data_xin,set_data_xat,set_data_value&#xa;</xsl:text>
    <!-- ROWS -->
    <xsl:apply-templates select="descendant::SetData"/>
  </xsl:template>

  <!-- One CSV line per SetData: values of the enclosing START are looked up
       through the ancestor axis, followed by this node's own attributes. -->
  <xsl:template match="SetData">
    <xsl:value-of select="concat(ancestor::START/@id, $delim,
                                 ancestor::START/@service_code, $delim,
                                 ancestor::START/*[1]/Rational, $delim,
                                 ancestor::START/*[1]/Qualify, $delim,
                                 ancestor::START/Description/@num, $delim,
                                 ancestor::START/Description, $delim,
                                 ancestor::START/DataFile/@dg, $delim,
                                 ancestor::START/DataFile/@dg_id, $delim,
                                 ancestor::START/DataFile/@unit, $delim,
                                 ancestor::START/DataFile/@unit_id, $delim,
                                 @x, $delim,
                                 @xin, $delim,
                                 @xat, $delim,
                                 @value)"/>
    <xsl:text>&#xa;</xsl:text>
  </xsl:template>
</xsl:stylesheet>
Python (no `for` loops or `if`/`else` logic)
import lxml.etree as et
# LOAD XML AND XSL FILES
xml = et.parse('Input.xml')
xsl = et.parse('Script.xsl')

# INITIALIZE TRANSFORMER
transform = et.XSLT(xsl)

# TRANSFORM INPUT
result = transform(xml)
print(str(result))
# id,service_code,rational,qualify,description_num,description,data_file_dg,data_file_dg_id,data_file_unit,data_file_unit_id,set_data_x,set_data_xin,set_data_xat,set_data_value
# ID0001,0x5196,225196,6251960000A0DE,1213f2312,The parameter,12,let,,,,,,32
# DG0003,0x517B,23423,342342,3423423f3423,The third,55,big,,,E1,,,21259
# DG0003,0x517B,23423,342342,3423423f3423,The third,55,big,,,E2,,,02
# ID0048,0x5198,225198,343243324234234,434234234,The forth,,,21,FEDS,,5,,323
# ID0048,0x5198,225198,343243324234234,434234234,The forth,,,21,FEDS,,123,,555
# ID0048,0x5198,225198,343243324234234,434234234,The forth,,,21,FEDS,,17,,23

# SAVE RESULT TO CSV
# str(result) is text, so the file must be opened in text mode; writing a
# str to a file opened with 'wb' raises TypeError on Python 3.  newline=''
# keeps the line endings produced by the stylesheet untouched.
with open('Output.csv', 'w', newline='') as f:
    f.write(str(result))
To process a whole folder of XML files, simply integrate the above into a loop. The version below wraps all XML processing in a single function, builds a list of results with a list comprehension, and finally writes them to the CSV one by one. NOTE: To get a single set of headers, write the headers only from Python into the CSV and remove them from the XSLT shown above.
import lxml.etree as et
from pathlib import Path
# Folder that is searched (recursively) for *.xml input files.
directory = '.'

# LOAD XSL SCRIPT ONCE (REMOVE THE HEADER LINE FROM THE XSLT: the header is
# written a single time below, not once per input file)
xsl = et.parse('Script.xsl')

# INITIALIZE TRANSFORMER once; it is reused for every input file.
transform = et.XSLT(xsl)


def proc_xml(xml_file):
    """Transform one XML file with the loaded stylesheet.

    Args:
        xml_file: Path of the XML file to parse.

    Returns:
        The transformation result converted to ``str``.
    """
    xml = et.parse(xml_file)     # LOAD XML FILE
    result = transform(xml)      # TRANSFORM INPUT
    return str(result)


xml_files_list = list(map(str, Path(directory).glob('**/*.xml')))
results = [proc_xml(x) for x in xml_files_list]

# SAVE RESULTS TO CSV
with open('Output.csv', 'w', newline='') as f:
    # Single header line for the combined output.
    f.write(
        'id,service_code,rational,qualify,description_num,description,'
        'data_file_dg,data_file_dg_id,data_file_unit,data_file_unit_id,'
        'set_data_x,set_data_xin,set_data_xat,set_data_value\n'
    )
    for r in results:
        f.write(r)
Post a Comment for "Repeating Elements To New Rows Elementtree"