问题描述
我有一个带有嵌套标签的 XML 文件,可以使用 DOM 或 JDOM 解析器来处理。我想把整个 XML 文件中所有标签文本里的单引号(')替换为双引号;标签也可能嵌套在其他标签内。我需要一个能够遍历所有标签并替换其值的循环,例如 HYPER SHIPPING'SDN BHD_First_Page --> HYPER SHIPPING''SDN BHD_First_Page
示例代码
/**
 * Recursively walks the element tree rooted at {@code parentNode} and, for every
 * element that has no child elements, replaces each single quote (') in its text
 * with a double quote (").
 *
 * @param parentNode element whose subtree is processed; elements with children
 *                   are only traversed, their own text is left untouched
 */
public void iterateChildNodes(org.jdom.Element parentNode) {
    if (parentNode.getChildren().size() == 0) {
        String text = parentNode.getText();
        if (text.contains("'")) {
            // In a Java string literal "\'" is the very same one-character string as "'",
            // so replaceAll("'", "\'") leaves the text unchanged. Replace with a real
            // double quote instead; String.replace avoids regex handling altogether.
            // NOTE(review): if the goal is to double the quote (''), use
            // text.replace("'", "''") here instead - confirm the intended output.
            parentNode.setText(text.replace('\'', '"'));
            LOGGER.info("************* Below Value updated");
            LOGGER.info(parentNode.getText());
        }
    } else {
        List<Element> rec = parentNode.getChildren();
        for (Element i : rec) {
            iterateChildNodes(i);
        }
    }
}
示例 XML 文件
<Document>
<Identifier>DOC1</Identifier>
<Type>HYPER SHIPPING SDN BHD</Type>
<Description>HYPER SHIPPING SDN BHD</Description>
<Confidence>33.12</Confidence>
<ConfidenceThreshold>10.0</ConfidenceThreshold>
<Valid>true</Valid>
<Reviewed>true</Reviewed>
<ReviewedBy>SYstem</ReviewedBy>
<ValidatedBy>SYstem</ValidatedBy>
<ErrorMessage/>
<Value>HYPER SHIPPING'SDN BHD_First_Page</Value> //Value to be replaced here
<DocumentdisplayInfo/>
<DocumentLevelFields/>
<Pages>
<Page>
<Identifier>PG0</Identifier>
<OldFileName>HYPER-KL FEB-0001-0001.tif</OldFileName>
<NewFileName>BI2E7_0.tif</NewFileName>
<SourceFileID>1</SourceFileID>
<PageLevelFields>
<PageLevelField>
<Name>Search_Engine_Classification</Name>
<Value>Park Street '10 road</Value> //Value to be replaced here
<Type/>
<Confidence>66.23</Confidence>
<LearnedFileName>HYPER KL-JUN-0001.tif</LearnedFileName>
<OcrConfidenceThreshold>0.0</OcrConfidenceThreshold>
<OcrConfidence>0.0</OcrConfidence>
<FieldOrderNumber>0</FieldOrderNumber>
<ForceReview>false</ForceReview>
</PageLevelField>
</PageLevelFields>
</Page>
</Pages>
</Document>
解决方法
下面的代码可以将 XML 文件中所有的 ' 替换为 "。
这里不再逐行解释,请按步骤自行运行代码,逻辑很容易理解。
(更新)
Part 1: Using JDOM
import java.util.ArrayList;
import java.util.List;
import org.w3c.dom.NodeList;
import org.jdom2.input.SAXBuilder;
import org.jdom2.transform.JDOMSource;
import org.w3c.dom.*;
import java.io.*;
/**
 * Reads {@code document.xml} with JDOM2, replaces every single quote (') in the
 * text of leaf elements with a double quote ("), and writes the result to
 * {@code updated-document-jdom.xml}.
 */
public class XmlModificationJDom {

    public static void main(String[] args) {
        XmlModificationJDom xmlModificationJDom = new XmlModificationJDom();
        xmlModificationJDom.updateXmlAndSaveJDom();
    }

    /**
     * Parses the input file, updates the element texts and saves the modified
     * document. Any failure is printed to standard error.
     */
    public void updateXmlAndSaveJDom() {
        try {
            File inputFile = new File("document.xml");
            SAXBuilder saxBuilder = new SAXBuilder();
            org.jdom2.Document xmlDocument = saxBuilder.build(inputFile);
            org.jdom2.Element rootElement = xmlDocument.getRootElement();
            iterateAndUpdateElementsUsingJDom(rootElement);
            saveUpdatedXmlUsingJDomSource(xmlDocument);
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Recursively visits {@code element} and its descendants; in every element
     * without child elements, single quotes in the text become double quotes.
     *
     * @param element root of the subtree to process
     */
    public void iterateAndUpdateElementsUsingJDom(org.jdom2.Element element) {
        if (element.getChildren().isEmpty()) {
            String text = element.getText();
            if (text.contains("'")) {
                // Literal replacement; no regular expression is needed for a single character.
                element.setText(text.replace('\'', '"'));
            }
            return;
        }
        for (org.jdom2.Element childElement : element.getChildren()) {
            iterateAndUpdateElementsUsingJDom(childElement);
        }
    }

    /**
     * Serializes the JDOM document to {@code updated-document-jdom.xml} by feeding
     * it to a JAXP identity transformer through a {@link JDOMSource}.
     *
     * @param xmlDocument the (already modified) document to write
     * @throws Exception if the transformer cannot be created or the write fails
     */
    public void saveUpdatedXmlUsingJDomSource(org.jdom2.Document xmlDocument) throws Exception {
        // Fully qualified JAXP names keep this class independent of extra import lines.
        javax.xml.transform.Transformer transformer =
                javax.xml.transform.TransformerFactory.newInstance().newTransformer();
        JDOMSource source = new JDOMSource(xmlDocument);
        javax.xml.transform.stream.StreamResult result =
                new javax.xml.transform.stream.StreamResult(new File("updated-document-jdom.xml"));
        transformer.transform(source, result);
    }
}
Part 2: Using DOM
import java.io.*;
import java.util.ArrayList;
import java.util.List;
import javax.xml.parsers.*;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
/**
 * Reads {@code document.xml} with the JAXP DOM parser, replaces every single
 * quote (') in the text of leaf elements with a double quote ("), and writes the
 * result to {@code updated-document.xml}.
 */
public class XmlModificationDom {

    public static void main(String[] args) {
        XmlModificationDom xmlModificationDom = new XmlModificationDom();
        xmlModificationDom.updateXmlAndSave();
    }

    /**
     * Parses the input file, updates the element texts and saves the modified
     * document. Any failure is printed to standard error.
     */
    public void updateXmlAndSave() {
        try {
            File inputFile = new File("document.xml");
            DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance();
            DocumentBuilder dBuilder = dbFactory.newDocumentBuilder();
            Document document = dBuilder.parse(inputFile);
            document.getDocumentElement().normalize();
            // Start from the document node itself: its first child may be a comment or
            // processing instruction rather than the root element, and starting here
            // also lets a root element that is itself a leaf be processed.
            iterateChildNodesAndUpate(document);
            writeAndSaveXML(document);
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Writes {@code document} to {@code updated-document.xml} using an identity
     * transformer.
     *
     * @param document the DOM document to serialize
     * @throws Exception if the transformer cannot be created or the write fails
     */
    public void writeAndSaveXML(Document document) throws Exception {
        TransformerFactory transformerFactory = TransformerFactory.newInstance();
        Transformer transformer = transformerFactory.newTransformer();
        DOMSource source = new DOMSource(document);
        StreamResult result = new StreamResult(new File("updated-document.xml"));
        transformer.transform(source, result);
    }

    /**
     * Recursively visits the element children of {@code parentNode}. Elements that
     * contain child elements are descended into; elements without child elements
     * get every single quote in their text content replaced by a double quote.
     *
     * @param parentNode node whose element children are processed
     */
    public void iterateChildNodesAndUpate(Node parentNode) {
        NodeList nodeList = parentNode.getChildNodes();
        for (int index = 0; index < nodeList.getLength(); index++) {
            Node node = nodeList.item(index);
            if (node.getNodeType() != Node.ELEMENT_NODE) {
                continue;
            }
            if (hasChildElement(node)) {
                // Never call setTextContent on a container: it would delete its child elements.
                iterateChildNodesAndUpate(node);
            } else {
                String text = node.getTextContent();
                if (text.contains("'")) {
                    node.setTextContent(text.replace('\'', '"'));
                }
            }
        }
    }

    /** Returns true when at least one direct child of {@code node} is an element. */
    private static boolean hasChildElement(Node node) {
        for (Node child = node.getFirstChild(); child != null; child = child.getNextSibling()) {
            if (child.getNodeType() == Node.ELEMENT_NODE) {
                return true;
            }
        }
        return false;
    }
}
输入文件 document.xml:
<Document>
<Identifier>DOC1</Identifier>
<Type>HYPER SHIPPING SDN BHD</Type>
<Description>HYPER SHIPPING SDN BHD</Description>
<Confidence>33.12</Confidence>
<ConfidenceThreshold>10.0</ConfidenceThreshold>
<Valid>true</Valid>
<Reviewed>true</Reviewed>
<ReviewedBy>SYSTEM</ReviewedBy>
<ValidatedBy>SYSTEM</ValidatedBy>
<ErrorMessage/>
<Value>HYPER SHIPPING'SDN BHD_First_Page</Value> //Value to be replaced here
<DocumentDisplayInfo/>
<DocumentLevelFields/>
<Pages>
<Page>
<Identifier>PG0</Identifier>
<OldFileName>HYPER-KL FEB-0001-0001.tif</OldFileName>
<NewFileName>BI2E7_0.tif</NewFileName>
<SourceFileID>1</SourceFileID>
<PageLevelFields>
<PageLevelField>
<Name>Search_Engine_Classification</Name>
<Value>Park Street '10 road</Value> //Value to be replaced here
<Type/>
<Confidence>66.23</Confidence>
<LearnedFileName>HYPER KL-JUN-0001.tif</LearnedFileName>
<OcrConfidenceThreshold>0.0</OcrConfidenceThreshold>
<OcrConfidence>0.0</OcrConfidence>
<FieldOrderNumber>0</FieldOrderNumber>
<ForceReview>false</ForceReview>
</PageLevelField>
</PageLevelFields>
</Page>
</Pages>
</Document>
输出文件 updated-document.xml / updated-document-jdom.xml:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<Document>
<Identifier>DOC1</Identifier>
<Type>HYPER SHIPPING SDN BHD</Type>
<Description>HYPER SHIPPING SDN BHD</Description>
<Confidence>33.12</Confidence>
<ConfidenceThreshold>10.0</ConfidenceThreshold>
<Valid>true</Valid>
<Reviewed>true</Reviewed>
<ReviewedBy>SYSTEM</ReviewedBy>
<ValidatedBy>SYSTEM</ValidatedBy>
<ErrorMessage/>
<Value>HYPER SHIPPING"SDN BHD_First_Page</Value><DocumentDisplayInfo/>
<DocumentLevelFields/>
<Pages>
<Page>
<Identifier>PG0</Identifier>
<OldFileName>HYPER-KL FEB-0001-0001.tif</OldFileName>
<NewFileName>BI2E7_0.tif</NewFileName>
<SourceFileID>1</SourceFileID>
<PageLevelFields>
<PageLevelField>
<Name>Search_Engine_Classification</Name>
<Value>Park Street "10 road</Value><Type/>
<Confidence>66.23</Confidence>
<LearnedFileName>HYPER KL-JUN-0001.tif</LearnedFileName>
<OcrConfidenceThreshold>0.0</OcrConfidenceThreshold>
<OcrConfidence>0.0</OcrConfidence>
<FieldOrderNumber>0</FieldOrderNumber>
<ForceReview>false</ForceReview>
</PageLevelField>
</PageLevelFields>
</Page>
</Pages>
</Document>
更多详细代码,请访问 this repo。
另一种做法:在 Java 字符串字面量中,双引号必须用反斜杠转义(单引号前的反斜杠可加可不加),例如 value = value.replace("\'","\"");
只需将 removeQuote 方法替换为:
/**
 * Replaces every single quote (') with a double quote (") in the text of the
 * {@code Value} child of each {@code Documents/Document} element.
 *
 * @param batchXml document whose root is expected to contain a {@code Documents}
 *                 element (presumably a JDOM document - confirm against the caller)
 * @throws JDOMException declared for callers; kept from the original signature
 * @throws Exception     declared for callers; kept from the original signature
 */
private static void removeQuote(Document batchXml) throws JDOMException,Exception {
    Element root = batchXml.getRootElement();
    Element documents = root.getChild("Documents");
    if (documents == null) {
        // Nothing to update when the batch has no Documents section.
        return;
    }
    List<Element> docs = documents.getChildren("Document");
    for (Element doc : docs) {
        Element valueElement = doc.getChild("Value");
        if (valueElement == null) {
            continue;
        }
        // Write the replaced text back into the element; computing it into a local
        // variable alone would leave the XML unchanged.
        valueElement.setText(valueElement.getText().replace('\'', '"'));
    }
}