JAVA利用HTMLParser实现爬虫程序源码

  categories:资料  author:

来源:互联网

JAVA利用HTMLParser实现爬虫程序源码 

package my.url.test;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLConnection;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
public class MyUrl {
private  URL url ;
private File file;
private static String dirName;
private static int depth = 0;
private static int number = 0;
private static HttpURLConnection urlConnectin ;

public static void main(String[] args){
try {
String [] urlNames = new String[]{
“http://www.ebiotrade.com/newsf/”,
};
init(urlNames);
} catch (MalformedURLException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} catch (URISyntaxException e) {
e.printStackTrace();
} catch (ParserException e) {
e.printStackTrace();
} catch (Exception e) {
e.printStackTrace();
}
}

public static void  init(String[] urlNames) throws Exception{
for(int i=0;i<urlNames.length;i++){
MyUrl myUrl = new MyUrl(urlNames);
myUrl.connect();
myUrl.html2txt(urlConnectin,dirName);
}
}

public MyUrl(String urlString) throws URISyntaxException, ParserException, IOException{
url= new URL(urlString);
if(url.getPath().equals(“”) || url.getPath().equals(“/”)){
dirName = url.toURI().getHost();
}else{
dirName = url.toURI().getHost()+””+url.toURI().getPath();
}
CreateDir(dirName);
}

private void CreateDir(String dirName) throws URISyntaxException{
//创建目录
String[] dirNames = dirName.split(“/”);
String temp = “”;
for(int i=0;i<((dirNames.length == 4)?dirNames.length – 1:dirNames.length);i++){
temp += dirNames+”/”;
File dir = new File(“c:/”+ temp);
if(dir.exists() == false){
System.out.println(“创建文件夹:”+dirNames);
dir.mkdir();
}
}
}

private void  connect() throws Exception{
urlConnectin = (HttpURLConnection)url.openConnection();
urlConnectin.setRequestProperty(“User-Agent”, “Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)”);
urlConnectin.setRequestProperty(“Accept”,”image/gif,image/x-xbitmap, image/jpeg, image/pjpeg,application/x-shockwave-flash, application/vnd.ms-powerpoint,application/vnd.ms-excel, application/msword, */*”);
urlConnectin.setRequestProperty(“Accept-Language”, “zh-cn”);
urlConnectin.setRequestProperty(“UA-CPU”, “x86″);
//urlConnectin.setRequestProperty(“Accept-Encoding”, “gzip”);//为什么没有deflate呢
urlConnectin.setRequestProperty(“Content-type”, “text/html”);
urlConnectin.setRequestProperty(“Connection”, “close”);
urlConnectin.setUseCaches(false);//不要用cache,用了也没有什么用,因为我们不会经常对一个链接频繁访问。(针对程序)
//urlConnectin.setConnectTimeout(10 * 1000);
//urlConnectin.setReadTimeout(10 * 1000);
urlConnectin.setDoOutput(true);
urlConnectin.setDoInput(true);
if(depth == 0){
analyzeHTML(urlConnectin,”gb2312″);
}

}

private void readAndWrite(HttpURLConnection urlConnectin,StringfileName) throws IOException, ParserException, URISyntaxException{
if(url.getPath().equals(“”)){
file = new File(“c:/”+dirName+”/index.html”);
}else{
file = new File(“c:/”+dirName);
}
if(file.exists() == false){
number ++;
System.out.println(“下载:”+number);
System.out.println(file.getPath());
file.createNewFile();
FileWriter fw = new FileWriter(file);
BufferedWriter bw = new BufferedWriter(fw);
String inputLine;
BufferedReader in = new BufferedReader( new InputStreamReader(urlConnectin.getInputStream()));

while ((inputLine = in.readLine()) != null) {
bw.write(inputLine);
bw.newLine();
bw.flush();
}
}
}
//解析出连接
private void analyzeHTML(HttpURLConnection urlConnectin,String encode) throws Exception{
depth = 1;
Parser parser = new Parser(urlConnectin);
parser.setEncoding(encode);
NodeFilter filter = new AndFilter(new TagNameFilter(“a”),new HasAttributeFilter(“class”,”TDcenter”));
NodeList nodeList = parser.parse(filter);
NodeIterator it = nodeList.elements();
while(it.hasMoreNodes()){
Node node = it.nextNode();
init(new String[]{((LinkTag)node).getLink()});
}
}
//下载图片
private void downloadImageAnalyzeHtml(URLConnection urlConnectin,String encode) throws Exception{
Parser parser = new Parser(urlConnectin);
parser.setEncoding(encode);
NodeFilter filter = new TagNameFilter(“img”);
NodeList nodeList = parser.parse(filter);
NodeIterator it = nodeList.elements();
while(it.hasMoreNodes()){
Node node = it.nextNode();
if(((ImageTag)node).getImageURL().toString().startsWith(“http”) == false){
String tempDirectory = url.getHost()+((ImageTag)node).getImageURL().replace(“file:”, “”);
CreateDir(tempDirectory);
URLConnectionDownloader.download(“http://”+tempDirectory,”c:/”+tempDirectory);
}
}
}
//解析成txt文件
private void html2txt(HttpURLConnection urlConnectin,String dirName) throws Exception{
File dirTxt = new File(“c:/”+dirName+”.txt”);
if(dirTxt.exists() == false){
Parser parser = new Parser(urlConnectin);
parser.setEncoding(“gb2312″);

NodeFilter filter =new OrFilter(new NodeFilter[]{newTagNameFilter(“p”),new HasAttributeFilter(“class”,”MsoNormal”),newAndFilter(new TagNameFilter(“span”),newHasAttributeFilter(“class”,”newsf”))});
NodeList  nodeList = parser.parse(filter);
NodeIterator it = nodeList.elements();
FileWriter fw = new FileWriter(dirTxt);
BufferedWriter bw = new BufferedWriter(fw);
if(it.hasMoreNodes()){
dirTxt.createNewFile();
System.out.println(“创建文件:”+dirTxt);
}
while(it.hasMoreNodes()){
Node node = it.nextNode();
bw.write(node.toHtml());
bw.newLine();
bw.flush();
}
bw.close();
downloadImageAnalyzeHtml((dirTxt.toURI().toURL().openConnection()),”gb2312″);
}
}

}
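// The following class was taken from the internet. It is a public class with its own imports, so it belongs in a separate file, URLConnectionDownloader.java.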
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;
import java.net.URLConnection;
/**
 * Downloads a file or image through {@link URLConnection} and saves it to the
 * local file system.
 *
 * @author 老紫竹(laozizhu.com)
 */
public class URLConnectionDownloader {
    /**
     * Demo entry point: downloads a sample logo into the working directory.
     *
     * @param args ignored
     * @throws Exception if the download fails
     */
    public static void main(String[] args) throws Exception {
        download("http://www.cnbeta.com/images/cnlogo.gif", "cnlogo.gif");
    }

    /**
     * Downloads the resource at {@code urlString} and writes it to
     * {@code filename}, replacing any existing file of that name.
     *
     * @param urlString address of the resource to download
     * @param filename  path of the local file to write
     * @throws Exception if the URL is malformed, the connection cannot be
     *                   opened, or reading/writing fails
     */
    public static void download(String urlString, String filename) throws Exception {
        URL url = new URL(urlString);
        URLConnection con = url.openConnection();
        InputStream is = con.getInputStream();
        try {
            OutputStream os = new FileOutputStream(filename);
            try {
                // Copy in 1 KiB chunks until end of stream.
                byte[] bs = new byte[1024];
                int len;
                while ((len = is.read(bs)) != -1) {
                    os.write(bs, 0, len);
                }
            } finally {
                // Close the output even if the copy fails part-way.
                os.close();
            }
        } finally {
            // Close the input even if opening or writing the output fails.
            is.close();
        }
    }
}



快乐成长 每天进步一点点