一个简单的java网络爬虫,由于时间原因,没有进一步解释.
需要的htmlparser.jar包到官方网上去下.
---------------------------------------------Spider.java-----------------------------------------------------------------
import java.io.BufferedReader; import java.io.InputStreamReader; import java.net.URL; import java.net.URLConnection; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.List; import org.htmlparser.RemarkNode; import org.htmlparser.StringNode; import org.htmlparser.Node; import org.htmlparser.tags.*; import org.htmlparser.Parser; import org.htmlparser.filters.StringFilter; import org.htmlparser.util.NodeIterator; import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; import java.util.Queue; import java.util.LinkedList;
public class Spider implements Runnable {
boolean search_key_words = false; // set true once the keyword has been found (NOTE(review): not set in visible code — confirm on later pages)
int count = 0; // keyword occurrences counted on the page currently being parsed
int limitsite = 10; // maximum number of sites to search
int countsite = 1; // number of sites searched so far
String keyword = "中国";// the keyword to search for (default; overwritten by the constructor)
Parser parser = new Parser(); // htmlparser engine, reused for every page
String startsite = "";// the starting site of the crawl
SearchResultBean srb;// holds one search result (keyword, url, hit count)
List resultlist = new ArrayList();// list of results for links where the keyword was found
List searchedsite = new ArrayList();// list of sites that have already been searched
Queue linklist = new LinkedList();// queue of links still waiting to be parsed
// cache of robots.txt disallow lists, keyed by host, so each host is fetched once
HashMap<String, ArrayList<String>> disallowListCache = new HashMap<String, ArrayList<String>>();
/**
 * Creates a spider that crawls outward from {@code startsite} looking for
 * {@code keyword}, seeding the pending-link queue with the start URL.
 *
 * @param keyword   the keyword to count on each page
 * @param startsite the URL the crawl begins at
 */
public Spider(String keyword, String startsite) {
    this.keyword = keyword;
    this.startsite = startsite;
    this.srb = new SearchResultBean();
    this.linklist.add(startsite);
}
/**
 * Thread entry point: drains the pending-link queue, crawling each URL.
 */
@Override
public void run() {
    this.search(linklist);
}
/**
 * Drains the pending-link queue, processing each URL that has not been
 * visited yet and that robots.txt permits.
 *
 * @param queue queue of URL strings still to be processed; each element is
 *              removed after its attempt, whether it succeeded or failed
 */
public void search(Queue queue) {
    String url = "";
    while (!queue.isEmpty()) {
        url = queue.peek().toString(); // inspect the head of the queue
        try {
            if (!isSearched(searchedsite, url)) {
                if (isRobotAllowed(new URL(url))) // is this link allowed by robots.txt?
                    processHtml(url);
                else
                    System.out.println("this page is disallowed to search");
            }
        } catch (Exception ex) {
            // Report the failure and continue with the next URL. The original
            // empty catch block silently swallowed every error (bad URLs,
            // connection failures, parse errors), making the crawler
            // impossible to debug.
            System.out.println("failed to process " + url + " : " + ex);
        }
        queue.remove();
    }
}
/**
 * Downloads and parses one page, counting keyword occurrences via
 * {@code parserNode} and recording the result in {@code resultlist}.
 *
 * @param url the page to fetch and scan
 * @throws ParserException if the HTML cannot be parsed
 * @throws Exception on connection failures
 */
public void processHtml(String url) throws ParserException, Exception {
    searchedsite.add(url); // mark as visited before parsing so it is never re-queued
    count = 0; // reset the per-page keyword hit counter
    System.out.println("searching ... :" + url);
    parser.setURL(url);
    parser.setEncoding("GBK"); // NOTE(review): hard-coded charset; non-GBK pages will be garbled
    URLConnection uc = parser.getConnection();
    uc.connect();
    NodeIterator nit = parser.elements();
    while (nit.hasMoreNodes()) {
        Node node = nit.nextNode();
        parserNode(node); // walks the node tree, counting keyword hits and collecting links
    }
    // BUG FIX: the original reused the single 'srb' field for every page, so
    // 'resultlist' ended up holding N references to ONE bean whose fields all
    // reflected only the last page crawled. Allocate a fresh bean per page.
    srb = new SearchResultBean();
    srb.setKeywords(keyword);
    srb.setUrl(url);
    srb.setCount_key_words(count);
    resultlist.add(srb);
    System.out.println("count keywords is :" + count);
    System.out.println("----------------------------------------------");
}
// (Source truncated here: the original blog post continues on subsequent pages
// with the rest of the Spider class — parserNode, isSearched, isRobotAllowed,
// the closing brace, and SearchResultBean.)