Solr索引pdf、txt、word等文件

原創

zml19910925

2020-02-21 10:38

這裏用的solr4.7

首先搭建環境

創建一個新core

這裏有詳細的資料

http://blog.csdn.net/clj198606061111/article/details/21288499/

修改core0裏面的xml

schema.xml

加入

 <!-- Analyzed text type "text_general".
      Index-time chain: standard tokenizer, stop-word filter (stopwords.txt,
      ignoreCase=true), lower-case filter.
      Query-time chain: the same tokenizer and stop-word filter, then a
      synonym filter (synonyms.txt, ignoreCase=true, expand=true), then the
      lower-case filter. -->
 <fieldType name="text_general" class="solr.TextField" positionIncrementGap="100">
      <analyzer type="index">
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" />
        <filter class="solr.LowerCaseFilterFactory"/>
      </analyzer>
      <analyzer type="query">
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" />
        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
        <filter class="solr.LowerCaseFilterFactory"/>
      </analyzer>
    </fieldType>

<!-- "text": indexed and stored field of type text_general; it is the field
     named by the fmap.content default of the /update/extract handler shown
     in this article.
     "_version_": indexed and stored long field.
     "attr_*": multi-valued text_general dynamic field; its prefix matches the
     uprefix value ("attr_") configured on the /update/extract handler. -->
<field name="text"      type="text_general" indexed="true"  stored="true"/>
  <field name="_version_" type="long"         indexed="true"  stored="true"/> 
  <dynamicField name="attr_*" type="text_general" indexed="true" stored="true" multiValued="true"/>

 
 
  <!-- General single-valued string fields, all indexed and stored.
       "id" is the only required one. The Java client in this article sends
       literal.id, literal.fileName and literal.path with every upload. -->
  <field name="id"        type="string"   indexed="true"  stored="true"  multiValued="false" required="true"/>
  <field name="type"      type="string"   indexed="true"  stored="true"  multiValued="false" /> 
  <field name="name"      type="string"   indexed="true"  stored="true"  multiValued="false" /> 
  <field name="fileName"      type="string"   indexed="true"  stored="true"  multiValued="false" /> 
    <field name="path"      type="string"   indexed="true"  stored="true"  multiValued="false" />

修改solrconfig.xml

  <!-- Registers solr.extraction.ExtractingRequestHandler at /update/extract
       with these default parameters:
         fmap.content = text
         lowernames   = true
         uprefix      = attr_
         captureAttr  = true
       NOTE(review): the Java client in this article also sends
       fmap.content=attr_content on each request; presumably that request
       parameter takes precedence over the default here - confirm against
       the Solr Cell documentation. -->
  <requestHandler name="/update/extract" class="solr.extraction.ExtractingRequestHandler" >
   <lst name="defaults">
    <str name="fmap.content">text</str>
    <str name="lowernames">true</str>
    <str name="uprefix">attr_</str>
    <str name="captureAttr">true</str>
   </lst>
  </requestHandler>

加入相應的jar包

solr-4.7.2\dist 目錄下的 solr-cell-4.7.2.jar

solr-4.7.2\contrib\extraction\lib 目錄下所有的jar包

然後引入jar包

在solrconfig.xml文件中加入以下配置：

  <lib dir="../extract" regex=".*\.jar" />

重啓tomcat

數據準備

然後代碼編寫

/**
 * Indexes every regular file directly inside {@code G:/document/} by passing
 * its name and path to {@code indexFilesSolrCell}. A failure on one file is
 * reported and the loop continues with the next file.
 *
 * @param args unused
 */
public static void main(String[] args) {
		File parentFile = new File("G:/document/");
		// listFiles() returns null when the path is missing, is not a
		// directory, or cannot be read, so this single check replaces the
		// exists() test and also prevents a NullPointerException below.
		File[] files = parentFile.listFiles();
		if (files == null) {
			System.err.println("Not a readable directory: " + parentFile.getPath());
			return;
		}
		for (File file : files) {
			// Only regular files can be uploaded; skip sub-directories.
			if (!file.isFile()) {
				continue;
			}
			try {
				indexFilesSolrCell(file.getName(), file.getPath());
			} catch (IOException e) {
				System.err.println("Failed to index " + file.getPath());
				e.printStackTrace();
			} catch (SolrServerException e) {
				System.err.println("Failed to index " + file.getPath());
				e.printStackTrace();
			}
		}
	}

	/**
	 * Uploads one file to the {@code /update/extract} handler of the Solr core
	 * at {@code http://localhost:8080/solr/core0}, commits, and then prints the
	 * {@code attr_filename} value of every document returned by a {@code *:*}
	 * query.
	 *
	 * @param fileName
	 *            name of the file; sent as {@code literal.id} and
	 *            {@code literal.fileName}, and used to pick the content type
	 * @param path
	 *            path of the file to upload; also sent as {@code literal.path}
	 * @throws IOException
	 *             if the file cannot be attached or the request fails on I/O
	 * @throws SolrServerException
	 *             if Solr reports an error for the update or the query
	 */
	public static void indexFilesSolrCell(String fileName, String path)
			throws IOException, SolrServerException {
		// Connect to the Solr core.
		String urlString = "http://localhost:8080/solr/core0";
		SolrServer solr = new HttpSolrServer(urlString);
		try {
			ContentStreamUpdateRequest up = new ContentStreamUpdateRequest(
					"/update/extract");

			String contentType = getFileContentType(fileName);
			up.addFile(new File(path), contentType);
			up.setParam("literal.id", fileName);
			up.setParam("literal.path", path);
			up.setParam("literal.fileName", fileName);
			up.setParam("fmap.content", "attr_content");
			up.setAction(AbstractUpdateRequest.ACTION.COMMIT, true, true);

			solr.request(up);

			// List what is currently in the index as a quick sanity check.
			QueryResponse rsp = solr.query(new SolrQuery("*:*"));
			SolrDocumentList solrDocumentList = rsp.getResults();
			for (SolrDocument solrDocument : solrDocumentList) {
				System.out.println(solrDocument.getFieldValue("attr_filename"));
			}
		} finally {
			// A new client is created on every call, so release its HTTP
			// resources here instead of leaking one client per indexed file.
			solr.shutdown();
		}
	}

	/**
	 * Returns the content type for a file name, chosen by its extension.
	 * <p>
	 * The extension is the text after the last {@code '.'} (the whole name when
	 * there is no dot) and is compared case-insensitively, so {@code "a.PDF"}
	 * and {@code "a.pdf"} give the same result.
	 *
	 * @param filename
	 *            file name to inspect; must not be {@code null}
	 * @return the MIME type for xlsx, pdf, doc, txt, xls, docx, ppt and pptx
	 *         files, or the placeholder {@code "othertype"} for anything else
	 */
	public static String getFileContentType(String filename) {
		// Locale.ROOT keeps lower-casing independent of the default locale.
		String extension = filename.substring(filename.lastIndexOf('.') + 1)
				.toLowerCase(java.util.Locale.ROOT);

		if (extension.equals("xlsx")) {
			return "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
		}
		if (extension.equals("pdf")) {
			return "application/pdf";
		}
		if (extension.equals("doc")) {
			return "application/msword";
		}
		if (extension.equals("txt")) {
			return "text/plain";
		}
		if (extension.equals("xls")) {
			return "application/vnd.ms-excel";
		}
		if (extension.equals("docx")) {
			return "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
		}
		if (extension.equals("ppt")) {
			return "application/vnd.ms-powerpoint";
		}
		if (extension.equals("pptx")) {
			return "application/vnd.openxmlformats-officedocument.presentationml.presentation";
		}
		// Unknown extension: keep the placeholder existing callers receive.
		return "othertype";
	}

solr操作

zml19910925

發佈了36 篇原創文章 · 獲贊 3 · 訪問量 2萬+

私信關注

發表評論

所有評論

還沒有人評論，想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.

Solr索引pdf.txt.word等文件

[轉帖]使用NMT和pmap解決JVM資源泄漏問題原創

Python實現大麥網搶票的四大關鍵技術點解析

Python 安裝庫指令大全

salesforce零基礎學習（一百三十八）零碎知識點小總結（十）

一款開源的.NET程序集反編譯、編輯和調試神器

關於接口協議，你必須要知道這些！

基於 Milvus + LlamaIndex 實現高級 RAG

【2024-05-21】以茶會友

log4j的優先級解讀與簡單實例

springmvc集成動態數據源

springmvc集成mongo

java動態代理解讀

solr集成mysql

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結