`
xinklabi
  • 浏览: 1555955 次
  • 性别: Icon_minigender_1
  • 来自: 吉林
文章分类
社区版块
存档分类
最新评论

HTML文件转换成XML文件

阅读更多

HTML文件轉換成XML文件

import java.io.BufferedInputStream;

import java.io.FileOutputStream;

import java.io.FileWriter;

import java.io.IOException;

import java.io.PrintWriter;

import java.net.URL;

import org.w3c.tidy.Tidy;

public class HTML2XML {

private String url;

private String outFileName;

private String errOutFileName;

public HTML2XML(String url, String outFileName, String errOutFileName){

this.url = url;//提供一個輸入的URL

this.outFileName = outFileName;//輸出文件

this.errOutFileName = errOutFileName;//錯誤文件

}

public static void main(String[] args){

//參數:HTML文件的UTL,輸出文件和錯誤文件名

System.out.println("程序開始運行……");

HTML2XML t = new HTML2XML("http://hswebuat01/maximo/help.html",

"c:\\temp\\html2xml.xml","c:\\temp\\err.txt");

t.convert();

System.out.println("程序運行結束……");

}

public void convert(){

URL u;

BufferedInputStream in;

FileOutputStream out;

Tidy tidy = new Tidy();

tidy.setXmlOut(true);//告訴TidyHTML轉換成XML

try{

tidy.setErrout(new PrintWriter(new FileWriter(errOutFileName),true));//將錯誤信息保存到文件中

u = new URL(url);

 

in = new BufferedInputStream(u.openStream());//創建一個輸入輸出流

out = new FileOutputStream(outFileName);

tidy.parse(in, out);//轉換文件

in.close();

out.close();

}catch(IOException e){

System.out.println(this.toString()+e.toString());

}

}

}

 

 

 

 

/Files/Snowfun/Html2XML.zip

/Files/Snowfun/jtidy-r938.zip

要点:

(1)BufferedInputStream()的应用

(2)第三方库JTidy(org.w3c.tidy.Tidy)的使用

(3)URL的使用

注意:

该程序中用到的org.w3c.tidy包可以在 http://sourceforge.net/projects/jtidy/ 下载。解压后引用build文件夹中的tidy包即可。

 

下面是解决转换过程中的乱码问题:

首先将源网页用UTF-8重新编码放到一个新的文件,

还要注意加上:

tidy.setInputEncoding("UTF-8");

才能正确显示

源代码如下:


 

import java.net.URL;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.io.*;


import org.w3c.tidy.Tidy;

/**
 * Converts an HTML document fetched from a URL into an XML file using JTidy,
 * re-encoding the page to UTF-8 first.
 *
 * <p>The page is decoded as GB2312 (hard-coded), written to a temporary file
 * as UTF-8 with an XML declaration prepended, and that temporary file is then
 * fed to Tidy with its input encoding set to UTF-8.
 */
public class xml {
    private final String url;
    private final String outFileName;
    private final String errOutFileName;

    /**
     * @param url            URL of the HTML document to read
     * @param outFileName    path of the XML file to write
     * @param errOutFileName path of the file that receives Tidy's messages
     */
    public xml(String url, String outFileName, String errOutFileName) {
        this.url = url;
        this.outFileName = outFileName;
        this.errOutFileName = errOutFileName;
    }

    /**
     * Runs the two-step conversion: transcode the page to a UTF-8 temporary
     * file, then let Tidy turn that file into XML at {@code outFileName}.
     *
     * <p>An {@link IOException} is not propagated; it is reported on standard
     * output. Every stream is closed and the temporary file is deleted whether
     * or not the conversion succeeds.
     */
    public void convert() {
        Logger log = Logger.getLogger("convert");
        File tmpNewFile = null;
        try {
            URL u = new URL(url);

            // Step 1: decode the page as GB2312 and rewrite it as UTF-8.
            // Each stream is its own resource so that a failure while
            // constructing a wrapper still closes the stream underneath it.
            try (BufferedInputStream in = new BufferedInputStream(u.openStream());
                 InputStreamReader isr = new InputStreamReader(in, "GB2312")) {
                tmpNewFile = File.createTempFile("GB2312", ".html");
                try (FileOutputStream out = new FileOutputStream(tmpNewFile);
                     OutputStreamWriter osw = new OutputStreamWriter(out, "UTF-8")) {
                    osw.write("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n");

                    char[] buffer = new char[10240];
                    int len;
                    // Copy characters until the end of the source is reached.
                    while ((len = isr.read(buffer)) != -1) {
                        osw.write(buffer, 0, len);
                    }
                }
            }

            if (log.isLoggable(Level.INFO)) {
                log.info("HTML 文档转 UTF-8 编码完成!");
            }

            // Step 2: run Tidy on the UTF-8 temporary file.
            Tidy tidy = new Tidy();
            // Tell Tidy to convert HTML to XML.
            tidy.setXmlOut(true);
            tidy.setInputEncoding("UTF-8");
            try (PrintWriter errOut = new PrintWriter(new FileWriter(errOutFileName), true);
                 FileInputStream in0 = new FileInputStream(tmpNewFile);
                 FileOutputStream out0 = new FileOutputStream(outFileName)) {
                // Send Tidy's messages to the error file.
                tidy.setErrout(errOut);
                tidy.parse(in0, out0);
            }
        } catch (IOException e) {
            System.out.println(this.toString() + e.toString());
        } finally {
            // Remove the temporary file even when the conversion failed.
            if (tmpNewFile != null) {
                tmpNewFile.delete();
            }
        }
    }

    public static void main(String[] args) {
        /*
         * Parameters are:
         * URL of HTML file
         * Filename of output file
         * Filename of error file
         */
        String u = "http://www.baidu.com/";
        String o = "index.xml";
        String e = "error.xml";

        xml t = new xml(u, o, e);
        t.convert();
        System.out.println("OK!");
    }
}

分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics