有了以上幾篇的概念,修改了一下程式碼,改成從 HTML 裡抓出資料來建立索引
有點像Google Search,只是爬蟲(抓資料)部份非常簡單而已
程式碼如下:
package testlucene;
import java.io.*; import java.net.HttpURLConnection; import java.net.URL; import java.net.URLConnection; import java.util.Date; import org.apache.lucene.document.Document; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexWriter;
public class LuceneIndexHtml {
private IndexWriter writer = null;
// 做測試的網址 private String url[] = { "http://catyku.pixnet.net/blog/post/22417532", "http://catyku.pixnet.net/blog/post/22393052", "http://catyku.pixnet.net/blog/post/22561736" };
private Document doc = null;
public LuceneIndexHtml() { try { // 建立index的寫入器 // 使用標準的分詞器 // 重新建立索引檔,也就是之前的檔案會全數重建 writer = new IndexWriter("d:\\index", new StandardAnalyzer(), true); doc = new Document(); } catch (Exception e) {
} }
private void setDocument(BufferedReader reader, String link) { Field field = new Field("content", reader); // 加入index文件檔裡,無法取得資訊 doc.add(field); // 加入資訊,不做分詞,search不到此資訊,但可以取得 doc.add(new Field("url", link, Field.Store.YES, Field.Index.UN_TOKENIZED)); }
private Document getDocument() {
return doc;
}
// 把url的資料全都加入index裡 public void writeToIndex() throws IOException {
for (int i = 0; i < url.length; i++) { String sURL = url[i]; // String sURL = "http://localhost:8080/TestJSP/index1.txt"; URL lurl = new URL(sURL); URLConnection URLConn = (HttpURLConnection) lurl.openConnection(); URLConn.setRequestProperty("User-agent", "IE/6.0"); BufferedReader in = new BufferedReader(new InputStreamReader( URLConn.getInputStream())); System.out.println(url[i]);
setDocument(in, url[i]); // in.close(); } writer.addDocument(getDocument()); }
// 記得要關閉才會真的寫入檔案喔 public void close() throws IOException { writer.close(); }
public static void main(String args[]) throws IOException { LuceneIndexHtml indexer = new LuceneIndexHtml(); Date start = new Date(); indexer.writeToIndex(); Date end = new Date(); System.out.println("建立索引用時" + (end.getTime() - start.getTime()) + "毫杪"); indexer.close(); }
} |
再來寫一份測試的Search,可以參考
原始碼如下:
package testlucene;

import java.io.IOException;
import java.util.Date;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;

/**
 * Runs keyword queries against the {@code content} field of the index stored
 * in {@code d:\index} and prints the {@code url} field of every hit.
 */
public class LuceneSearch {
    private final IndexSearcher searcher;
    private final Analyzer analyzer = new StandardAnalyzer();

    /**
     * Opens the searcher on the index directory.
     *
     * @throws IllegalStateException if the index cannot be opened
     */
    public LuceneSearch() {
        try {
            searcher = new IndexSearcher(IndexReader.open("d:\\index"));
        } catch (IOException e) {
            // Report the real cause now rather than failing later with a
            // NullPointerException on the first search.
            throw new IllegalStateException("Cannot open index at d:\\index", e);
        }
    }

    /**
     * Searches the {@code content} field and reports how long the search took.
     *
     * <p>The keyword is handed to the query parser, so it may contain several
     * terms separated by spaces; upper-case {@code AND} / {@code OR} act as
     * boolean operators.
     *
     * @param keyword query text, must not be {@code null} or blank
     * @return the result set of the query
     * @throws IllegalArgumentException if {@code keyword} is {@code null} or blank
     * @throws Exception if the keyword cannot be parsed or the index cannot be read
     */
    public final Hits search(String keyword) throws Exception {
        if (keyword == null || keyword.trim().length() == 0) {
            throw new IllegalArgumentException("keyword must not be null or blank");
        }
        System.out.println("正在檢索關鍵字 : " + keyword);
        // Parser bound to the field to search and the analyzer to apply.
        QueryParser qp = new QueryParser("content", analyzer);
        Query query = qp.parse(keyword);
        long start = System.currentTimeMillis();
        Hits hits = searcher.search(query);
        long end = System.currentTimeMillis();
        System.out.println("檢索完成,用時" + (end - start) + "毫杪");
        return hits;
    }

    /**
     * Prints the {@code url} field of every hit, or a notice when there is
     * none, followed by a separator line.
     *
     * @param h result set to print; {@code null} is treated as empty
     */
    public void printResult(Hits h) {
        if (h == null || h.length() == 0) {
            System.out.println("對不起!沒有您要找的資料!");
        } else {
            // length() is the total number of hits.
            for (int i = 0; i < h.length(); i++) {
                try {
                    // Only fields that were stored when the index was built
                    // can be read back here, so print "url".
                    Document doc = h.doc(i);
                    System.out.println(doc.get("url"));
                } catch (IOException e) {
                    // Keep going so one unreadable hit does not hide the rest.
                    e.printStackTrace();
                }
            }
        }
        System.out.println("---------------------------");
    }

    /**
     * Releases the underlying index searcher.
     *
     * @throws IOException if the searcher cannot be closed
     */
    public void close() throws IOException {
        searcher.close();
    }

    /** Runs a sample query and prints the result. */
    public static void main(String[] args) throws Exception {
        LuceneSearch test = new LuceneSearch();
        try {
            // Hits load their documents on demand, so print them while the
            // searcher is still open.
            test.printResult(test.search("FileUpload"));
        } finally {
            test.close();
        }
    }
}
其它參考資料
喵嗚
這個不簡單啦Orz...Comment Permissions: Allow commenting