Lucene을 Tika와 함께 사용합니다.

5888 단어 Lucene

1. 어떤 디렉터리에 있는 모든 파일 가져오기
/**
 * Helpers for walking a directory tree and printing the collected paths.
 */
public class DirectoryUtil {
	
	/**
	 * Recursively collects the paths of everything found under {@code file}.
	 *
	 * <p>If {@code file} is not a directory, its own path is the only entry
	 * added. Otherwise every child is visited depth-first; the contents of a
	 * sub-directory are added before the sub-directory's own path. Directory
	 * paths are part of the result, so callers that only want regular files
	 * should filter with {@link File#isFile()}.
	 *
	 * @param file the file or directory to start from
	 * @param list the list to append to; a new list is created when {@code null}
	 * @return the list that received the paths (never {@code null})
	 */
	public static List<String> listFiles(File file,List<String> list){
		if(list == null){
			list = new LinkedList<String>();
		}
		
		if(!file.isDirectory()){
			// Record the path rather than the bare name so the entry can be
			// reopened with new File(...), exactly like the entries added below.
			list.add(file.getPath());
			return list;
		}
		
		// File.listFiles() returns null when the directory cannot be read
		// (I/O error, missing permission); treat that as "no children".
		File[] children = file.listFiles();
		if(children == null){
			return list;
		}
		
		for(File child : children){
			if(child.isDirectory()){
				listFiles(child, list);
			}
			list.add(child.getPath());
		}
		
		return list;
	}
	
	
	/**
	 * Prints each element of a list on its own line of standard output.
	 * Arguments that are not a {@link List} (including {@code null}) are ignored.
	 *
	 * @param obj the object to print; only lists produce output
	 */
	public void print(Object obj){
		if(obj instanceof List){
			for(Object item : (List<?>) obj){
				System.out.println(item);
			}
		}
	}
	
	/**
	 * Manual smoke test: lists a hard-coded local directory and prints the result.
	 */
	@Test
	public void listFilesTest(){
		File file = new File("H:/baiduyundownload");
		List<String> list = DirectoryUtil.listFiles(file,null);
		new DirectoryUtil().print(list);
	}
	
}

 
2. 파일에서 정보를 추출하여 Lucene의 Document 객체에 추가
/**
 * Builds Lucene {@code Document}s from files, using Tika to detect the
 * content type and to supply the text that gets indexed.
 */
public class TikaExtractFile {

	/**
	 * Creates a Lucene document describing {@code file}.
	 *
	 * <p>The document carries these fields:
	 * <ul>
	 *   <li>{@code filename} - the file name, stored and analyzed</li>
	 *   <li>{@code address} - the file path, stored, not analyzed</li>
	 *   <li>{@code filetype} - the extension (including the dot) or, when the
	 *       name has none, the sub-type of the detected Content-Type; omitted
	 *       when neither is available</li>
	 *   <li>{@code filecontent} - a reader over the text Tika extracts</li>
	 * </ul>
	 *
	 * <p>The reader behind {@code filecontent} is deliberately left open: it is
	 * handed to the field and has to remain readable until the document has
	 * been added to an index.
	 *
	 * @param file the file to describe
	 * @return the populated document, or {@code null} when the file could not
	 *         be read or parsed (the exception is printed to standard error)
	 */
	public static Document getInformationInFile(File file){
		InputStream stream = null;
		try {
			// First pass: let the auto-detecting parser fill in the metadata.
			AutoDetectParser autoDetectParser = new AutoDetectParser();
			stream = new FileInputStream(file);
			Metadata metadata = new Metadata();
			ParseContext context = new ParseContext();
			// WriteOutContentHandler with an explicit limit, so large files are
			// not cut off at the body handler's default character limit.
			BodyContentHandler handler = new BodyContentHandler(new WriteOutContentHandler(1024*1024*1024));
			autoDetectParser.parse(stream, handler, metadata, context);

			Document doc = new Document();
			doc.add(new Field("filename", file.getName(), Field.Store.YES, Field.Index.ANALYZED));
			doc.add(new Field("address", file.getPath(), Field.Store.YES, Field.Index.NOT_ANALYZED));

			String type = resolveFileType(file.getName(), metadata.get("Content-Type"));
			if(type != null){
				doc.add(new Field("filetype", type, Field.Store.YES, Field.Index.NOT_ANALYZED));
			}

			// Open the content reader last: everything that can fail has already
			// run, so a failure above cannot leave this reader open and orphaned.
			Tika tika = new Tika();
			doc.add(new Field("filecontent", tika.parse(file)));
			return doc;
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} catch (SAXException e) {
			e.printStackTrace();
		} catch (TikaException e) {
			e.printStackTrace();
		}finally{
			if(stream != null){
				try {
					stream.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
		return null;
	}

	/**
	 * Picks the value for the {@code filetype} field.
	 *
	 * @param fileName    the file name
	 * @param contentType the detected Content-Type, possibly {@code null}
	 * @return the extension starting at the last dot (e.g. {@code ".txt"}) when
	 *         the name has a dot after its first character; otherwise the part
	 *         of the content type after the slash (the whole value when it has
	 *         no slash); {@code null} when no content type is known either
	 */
	private static String resolveFileType(String fileName, String contentType){
		// Last dot, not first: "a.b.txt" is a ".txt" file, not ".b.txt".
		int dot = fileName.lastIndexOf(".");
		if(dot > 0){
			return fileName.substring(dot);
		}

		System.out.println("type : " + contentType);
		if(contentType == null){
			return null;
		}
		int slash = contentType.indexOf('/');
		return slash >= 0 ? contentType.substring(slash + 1) : contentType;
	}
	
}

 
3. 파일 정보를 색인에 기록합니다.
/**
 * Builds a Lucene index from the files found under a hard-coded directory.
 */
public class IndexUtil {

	/**
	 * Indexes every regular file under {@code H:/baiduyundownload} into the
	 * index stored at {@code g:/mylucene}.
	 *
	 * <p>Files for which no document could be extracted are skipped. The
	 * writer and the directory are closed even when indexing fails, so the
	 * writer is not left open after an error. I/O errors are printed to
	 * standard error.
	 */
	public void index(){
		Directory directory = null;
		IndexWriter writer = null;
		try {
			// Location of the Lucene index on disk.
			String dirpath = "g:/mylucene";
			File file = new File(dirpath);
			directory = FSDirectory.open(file);
//			IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35)); 
			IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_35, new MMSegAnalyzer("G:\\c    \\lucene\\mmseg4j-1.8.5\\data")); 
			writer = new IndexWriter(directory, config);

			// Collect every path under the source directory.
			File dir = new File("H:/baiduyundownload");
			List<String> list = DirectoryUtil.listFiles(dir, null);
			for(String path : list){
				File tempfile = new File(path);
				// The listing also contains directories; only regular files are indexed.
				if(!tempfile.isFile()){
					continue;
				}
				// Tika extracts the file's information into a Document.
				Document doc = TikaExtractFile.getInformationInFile(tempfile);
				// Extraction returns null on failure; skip instead of passing null to the writer.
				if(doc == null){
					continue;
				}
				writer.addDocument(doc);
			}
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			if(writer != null){
				try {
					writer.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
			if(directory != null){
				try {
					directory.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
	}
	
		
	/**
	 * Scratch check of the split expression used to take the sub-type out of
	 * a Content-Type string; prints the part after the slash.
	 */
	@Test
	public void test1(){
		String str = "type : application/xml";
		System.out.println(str.split("\\/")[1]);
		
	}
	
	/**
	 * Runs a full indexing pass.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {
		IndexUtil util = new IndexUtil();
		util.index();
	}
	
}

 
 
 
 
 
 
 
 

좋은 웹페이지 즐겨찾기