Lucene은 Tika와 함께 사용합니다.

1. 특정 디렉터리에 있는 모든 파일 가져오기

public class DirectoryUtil {

	/**
	 * Recursively collects the paths of every entry found under {@code file}.
	 *
	 * <p>If {@code file} is a directory, the path of each entry below it is
	 * appended to the list. Sub-directories are descended into first and then
	 * their own path is appended as well, so the result contains directory
	 * paths in addition to regular files; callers that only want files must
	 * filter with {@link File#isFile()}. If {@code file} is not a directory,
	 * its own path is appended.
	 *
	 * @param file the file or directory to start from
	 * @param list the list to append to; a new list is created when {@code null}
	 * @return the list that was passed in (or the newly created one) with the
	 *         collected paths appended
	 */
	public static List<String> listFiles(File file,List<String> list){
		if(list == null){
			list = new LinkedList<String>();
		}

		if(!file.isDirectory()){
			// Store the path rather than the bare name so that the entry can be
			// turned back into a File, exactly like the entries added below.
			list.add(file.getPath());
			return list;
		}

		File[] files = file.listFiles();
		if(files == null){
			// listFiles() returns null when the directory cannot be read
			// (I/O error, missing permission); treat it as empty.
			return list;
		}

		for(File tempfile : files){
			if(tempfile.isDirectory()){
				listFiles(tempfile, list);
			}
			list.add(tempfile.getPath());
		}

		return list;
	}


	/**
	 * Prints every element of {@code obj} on its own line of standard output
	 * when {@code obj} is a {@link List}; any other argument is ignored.
	 *
	 * @param obj the object to print
	 */
	public void print(Object obj){
		if(obj instanceof List){
			for(Object item : (List<?>) obj){
				System.out.println(item);
			}
		}
	}

	/**
	 * Manual smoke test: lists a hard-coded local directory and prints the result.
	 */
	@Test
	public void listFilesTest(){
		DirectoryUtil util = new DirectoryUtil();
		File file = new File("H:/baiduyundownload");
		// listFiles is static, so call it through the class, not the instance.
		List<String> list = DirectoryUtil.listFiles(file,null);
		util.print(list);
	}

}

2. 파일에서 정보를 추출하여 Lucene의 Document 객체에 추가

public class TikaExtractFile {

	/**
	 * Builds a Lucene {@link Document} describing {@code file}.
	 *
	 * <p>The document gets these fields:
	 * <ul>
	 *   <li>{@code filename} - the file name, stored and analyzed</li>
	 *   <li>{@code address} - the file path, stored and not analyzed</li>
	 *   <li>{@code filetype} - the extension including the leading dot (taken
	 *       from the last dot in the name) or, when the name has no extension,
	 *       the subtype of the Content-Type detected by Tika; omitted when
	 *       neither is available</li>
	 *   <li>{@code filecontent} - the text extracted by Tika, supplied as a
	 *       {@link Reader}</li>
	 * </ul>
	 *
	 * <p>The reader attached to {@code filecontent} is left open on success
	 * because the document still references it; the caller is expected to add
	 * the document to an index, which consumes the reader.
	 *
	 * @param file the file to extract information from
	 * @return the populated document, or {@code null} when the file could not
	 *         be opened or parsed (the exception is printed to standard error)
	 */
	public static Document getInformationInFile(File file){
		InputStream stream = null;
		Reader reader = null;
		// Set only once the document is fully built; decides whether the reader
		// must be closed here or handed to the caller inside the document.
		boolean success = false;
		Tika tika = new Tika();
		try {
			// Parser that detects the document type and delegates to a matching parser.
			AutoDetectParser autoDetectParser = new AutoDetectParser();
			stream = new FileInputStream(file);
			reader = tika.parse(file);
			// Filled in by the parser; read below for the Content-Type.
			Metadata metadata = new Metadata();
			ParseContext context = new ParseContext();
			// BodyContentHandler's default write limit is 100000 characters, so an
			// explicit WriteOutContentHandler with a much larger limit is used.
			// The text captured by this handler is not read afterwards; this parse
			// is only relied on to populate the metadata.
			BodyContentHandler handler = new BodyContentHandler(new WriteOutContentHandler(1024*1024*1024));
			autoDetectParser.parse(stream, handler, metadata, context);

			Document doc = new Document();
			String name = file.getName();
			doc.add(new Field("filename", name, Field.Store.YES, Field.Index.ANALYZED));
			doc.add(new Field("address", file.getPath(), Field.Store.YES, Field.Index.NOT_ANALYZED));

			// Use the LAST dot so that "a.b.pdf" yields ".pdf" and not ".b.pdf".
			int dot = name.lastIndexOf('.');
			if(dot > 0){
				doc.add(new Field("filetype", name.substring(dot), Field.Store.YES, Field.Index.NOT_ANALYZED));
			}else{
				// No usable extension: fall back to the detected Content-Type.
				String contentType = metadata.get("Content-Type");
				System.out.println("type : " + contentType);
				String type = subtypeOf(contentType);
				if(type != null){
					doc.add(new Field("filetype", type, Field.Store.YES, Field.Index.NOT_ANALYZED));
				}
			}
			doc.add(new Field("filecontent",reader));
			success = true;
			return doc;
		} catch (FileNotFoundException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} catch (SAXException e) {
			e.printStackTrace();
		} catch (TikaException e) {
			e.printStackTrace();
		}finally{
			if(stream != null){
				try {
					stream.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
			// On failure nobody else will ever see the reader, so release it here.
			if(!success && reader != null){
				try {
					reader.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
		return null;
	}

	/**
	 * Extracts the subtype from a Content-Type value, e.g. {@code "xml"} from
	 * {@code "application/xml"} and {@code "plain"} from
	 * {@code "text/plain; charset=UTF-8"}.
	 *
	 * @param contentType the Content-Type value, may be {@code null}
	 * @return the subtype without parameters, or {@code null} when the value is
	 *         missing or has no subtype
	 */
	private static String subtypeOf(String contentType){
		if(contentType == null){
			return null;
		}
		String[] parts = contentType.split("/");
		if(parts.length < 2){
			return null;
		}
		String subtype = parts[1];
		// Drop parameters such as "; charset=UTF-8".
		int semicolon = subtype.indexOf(';');
		if(semicolon >= 0){
			subtype = subtype.substring(0, semicolon);
		}
		subtype = subtype.trim();
		return subtype.length() == 0 ? null : subtype;
	}

}

3. 파일 정보를 색인에 기록합니다.

public class IndexUtil {

	/**
	 * Builds the index using the hard-coded default locations: index stored in
	 * {@code g:/mylucene}, documents read from {@code H:/baiduyundownload}, and
	 * the mmseg4j dictionary directory passed to {@code MMSegAnalyzer}.
	 */
	public void index(){
		index("g:/mylucene", "H:/baiduyundownload", "G:\\c    \\lucene\\mmseg4j-1.8.5\\data");
	}

	/**
	 * Indexes every regular file found under {@code sourceDir} into a Lucene
	 * index located at {@code indexDir}.
	 *
	 * <p>Files for which {@link TikaExtractFile#getInformationInFile(File)}
	 * returns {@code null} are skipped. An {@link IOException} stops the run
	 * and is printed to standard error; the writer and the directory are
	 * closed in every case.
	 *
	 * @param indexDir      directory that holds the Lucene index
	 * @param sourceDir     directory whose files are indexed
	 * @param dictionaryDir path handed to the {@code MMSegAnalyzer} constructor
	 */
	public void index(String indexDir, String sourceDir, String dictionaryDir){
		Directory directory = null;
		IndexWriter writer = null;
		try {
			// Open the on-disk index directory.
			directory = FSDirectory.open(new File(indexDir));
			IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_35, new MMSegAnalyzer(dictionaryDir));
			writer = new IndexWriter(directory, config);

			// The listing also contains directory paths, hence the isFile() filter.
			List<String> list = DirectoryUtil.listFiles(new File(sourceDir), null);
			for(String path : list){
				File tempfile = new File(path);
				if(!tempfile.isFile()){
					continue;
				}
				// Let Tika extract the file information into a Document.
				Document doc = TikaExtractFile.getInformationInFile(tempfile);
				if(doc == null){
					// Extraction failed and was already reported; skip this file
					// instead of passing null to the writer.
					continue;
				}
				writer.addDocument(doc);
			}
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			// Always close the writer so it is not left open after a failure.
			if(writer != null){
				try {
					writer.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
			if(directory != null){
				try {
					directory.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
	}


	/**
	 * Checks how splitting on "/" behaves for a Content-Type style string.
	 */
	@Test
	public void test1(){
		String str = "type : application/xml";
		System.out.println(str.split("\\/")[1]);

	}

	/**
	 * Entry point: builds the index with the default locations.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {
		IndexUtil util = new IndexUtil();
		util.index();
	}

}

이 내용에 흥미가 있습니까?

현재 기사가 여러분의 문제를 해결하지 못하는 경우 AI 엔진은 머신러닝 분석(스마트 모델이 방금 만들어져 부정확한 경우가 있을 수 있음)을 통해 가장 유사한 기사를 추천합니다:

Elasticsearch 호출 Lucene 쿼리 인터페이스 원본 분석 6: 접두사 쿼리(Prefix)

소개 조회 문법 원본 분석 접두사 조회는 설정에 있어서 단어 조회와 유사하다. 접두사 검색은 이러한 문서와 일치할 수 있습니다. 이 문서의 특정 필드는 주어진 접두사로 시작됩니다. 예: 모든 제목 필드가 cri로 시작하...

텍스트를 자유롭게 공유하거나 복사할 수 있습니다. 하지만 이 문서의 URL은 참조 URL로 남겨 두십시오.

CC BY-SA 2.5, CC BY-SA 3.0 및 CC BY-SA 4.0에 따라 라이센스가 부여됩니다.

Lucene은 Tika와 함께 사용합니다.

좋은 웹페이지 즐겨찾기