HtmlParser: 경량급 웹 페이지 캡처 및 분석 도구
1868 단어 HtmlParser
/**
 * Demo entry point: downloads one news page and prints the HTML of every
 * node that matches the {@code .left_zw} CSS selector.
 *
 * <p>The page is read through a {@link URLConnection} handed to the
 * HTMLParser {@code Parser}, with the encoding set to GBK before parsing.
 *
 * <p>The same approach was previously tried against
 * {@code http://tech.qq.com/a/20131112/011680.htm} using the selector
 * {@code #Cnt-Main-Article-QQ}; switch both the URL and the selector
 * together to extract from that page instead.
 *
 * @param args command-line arguments (unused)
 * @throws IOException     if the URL is malformed or the connection cannot be opened
 * @throws ParserException if HTMLParser fails to read or parse the page
 */
public static void main(String[] args) throws IOException, ParserException {
    final String pageUrl = "http://www.chinanews.com/gn/2013/11-12/5492942.shtml";
    final String contentSelector = ".left_zw";
    final int connectTimeoutMillis = 10000;
    final int readTimeoutMillis = 30000;

    URLConnection urlConnection = new URL(pageUrl).openConnection();
    // Without explicit timeouts an unresponsive host would block this call forever.
    // They must be set before the connection is handed to the parser.
    urlConnection.setConnectTimeout(connectTimeoutMillis);
    urlConnection.setReadTimeout(readTimeoutMillis);

    Parser parser = new Parser(urlConnection);
    parser.setEncoding("GBK");

    // Keep only the nodes selected by the CSS selector and dump their HTML.
    NodeList matchedNodes = parser.parse(new CssSelectorNodeFilter(contentSelector));
    System.out.println("html:[" + matchedNodes.toHtml() + "]");
    logger.info("ok");
}
위 코드에 필요한 pom.xml 의존성 설정은 다음과 같습니다:
<!-- Maven dependency for the HTMLParser library (org.htmlparser:htmlparser, version 2.1). -->
<dependency>
<groupId>org.htmlparser</groupId>
<artifactId>htmlparser</artifactId>
<version>2.1</version>
</dependency>