htmlcleaner 使用示例.
原文出处:http://blog.chenlb.com/2008/11/htmlcleaner-use-demo.html
编程的时候,有时数据源来自html,这时就需要对html进行分析并提取数据。好在java社区里有很多相关的库可以解析html。经使用比较,个人觉得 htmlcleaner 比 htmlparser 好用,尤其是 htmlcleaner 的 xpath 特别好用;当然,也可能是我对 htmlparser 不够熟悉。
htmlcleaner 下载地址:htmlcleaner2_1.jar 源码下载:htmlcleaner2_1-all.zip
写一个测试用的html文件:html-clean-demo.html(示例代码按 GBK 编码从 html/html-clean-demo.html 读取它)。解析它的 Java 代码如下:
package
 com.chenlb;  
  
import
 java.io.File;  
  
import
 org.htmlcleaner.HtmlCleaner;  
import
 org.htmlcleaner.TagNode;  
  
/**
 
 * htmlcleaner 使用示例.
 
 *
 
 * @author chenlb 2008-11-26 下午02:12:02
 
 */
  
public
 
class
 HtmlCleanerDemo {  
  
    public
 
static
 
void
 main(String[] args) 
throws
 Exception {  
        HtmlCleaner cleaner = new
 HtmlCleaner();  
  
        TagNode node = cleaner.clean(new
 File(
"html/html-clean-demo.html"
), 
"GBK"
);  
        //按tag取.
  
        Object[] ns = node.getElementsByName("title"
, 
true
);    
//标题
  
  
        if
(ns.length > 
0
) {  
            System.out.println("title="
+((TagNode)ns[
0
]).getText());  
        }  
        System.out.println("ul/li:"
);  
        //按xpath取
  
        ns = node.evaluateXPath("//div[@class='d_1']//li"
);  
        for
(Object on : ns) {  
            TagNode n = (TagNode) on;  
            System.out.println("\ttext="
+n.getText());  
        }  
        System.out.println("a:"
);  
        //按属性值取
  
        ns = node.getElementsByAttValue("name"
, 
"my_href"
, 
true
, 
true
);  
        for
(Object on : ns) {  
            TagNode n = (TagNode) on;  
            System.out.println("\thref="
+n.getAttributeByName(
"href"
)+
", text="
+n.getText());  
        }  
    }  
}  package com.chenlb;
import java.io.File;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
/**
 * htmlcleaner 使用示例.
 *
 * @author chenlb 2008-11-26 下午02:12:02
 */
public class HtmlCleanerDemo {
	/**
	 * Parses the demo HTML file and prints the results of three lookups:
	 * by tag name, by XPath expression, and by attribute value.
	 *
	 * @param args command-line arguments; not used
	 * @throws Exception any failure raised by the HtmlCleaner calls is propagated to the caller
	 */
	public static void main(String[] args) throws Exception {
		// The demo page is read relative to the working directory, decoded as GBK.
		File htmlFile = new File("html/html-clean-demo.html");
		TagNode root = new HtmlCleaner().clean(htmlFile, "GBK");

		// Lookup by tag name: print the text of the first <title> element, if any.
		Object[] titles = root.getElementsByName("title", true);
		if (titles.length != 0) {
			TagNode title = (TagNode) titles[0];
			System.out.println("title=" + title.getText());
		}

		// Lookup by XPath: every <li> nested under a <div class="d_1">.
		System.out.println("ul/li:");
		Object[] items = root.evaluateXPath("//div[@class='d_1']//li");
		for (int i = 0; i < items.length; i++) {
			TagNode item = (TagNode) items[i];
			System.out.println("\ttext=" + item.getText());
		}

		// Lookup by attribute value: elements whose "name" attribute is "my_href".
		System.out.println("a:");
		Object[] links = root.getElementsByAttValue("name", "my_href", true, true);
		for (int i = 0; i < links.length; i++) {
			TagNode link = (TagNode) links[i];
			String href = link.getAttributeByName("href");
			System.out.println("\thref=" + href + ", text=" + link.getText());
		}
	}
}cleaner.clean()中的参数,可以是文件,可以是url,可以是字符串内容。个人认为:比较常用的应该是evaluateXPath、getElementsByAttValue、getElementsByName方法了。另外说明下,htmlcleaner 对不规范的html兼容性比较好。
相关推荐
  lupeng    2020-11-14  
   sjcheck    2020-11-10  
   nercon    2020-08-09  
   pythonclass    2020-07-29  
   玫瑰小妖    2020-07-18  
   WebVincent    2020-07-09  
   lyg0    2020-07-05  
   WebVincent    2020-06-16  
   huzijia    2020-06-16  
   qsdnet我想学编程    2020-06-13  
   pythonclass    2020-06-06  
   nercon    2020-06-06  
   gufudhn    2020-06-06  
   STPace    2020-06-04  
   HSdiana    2020-06-03  
   haocxy    2020-05-31  
   行吟阁    2020-05-30  
   haocxy    2020-05-28  
 