Solr索引pdf、txt、word等文件

這裏用的solr4.7


首先搭建環境

    創建一個新core

    這裏有詳細的資料

    http://blog.csdn.net/clj198606061111/article/details/21288499/


修改core0裏面的xml

  schema.xml

加入

 <fieldType name="text_general" class="solr.TextField" positionIncrementGap="100">
      <analyzer type="index">
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" />
        <filter class="solr.LowerCaseFilterFactory"/>
      </analyzer>
      <analyzer type="query">
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" />
        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
        <filter class="solr.LowerCaseFilterFactory"/>
      </analyzer>
    </fieldType>

<field name="text"      type="text_general" indexed="true"  stored="true"/>
  <field name="_version_" type="long"         indexed="true"  stored="true"/> 
  <dynamicField name="attr_*" type="text_general" indexed="true" stored="true" multiValued="true"/>

 
 
  <!-- general -->
  <field name="id"        type="string"   indexed="true"  stored="true"  multiValued="false" required="true"/>
  <field name="type"      type="string"   indexed="true"  stored="true"  multiValued="false" /> 
  <field name="name"      type="string"   indexed="true"  stored="true"  multiValued="false" /> 
  <field name="fileName"      type="string"   indexed="true"  stored="true"  multiValued="false" /> 
    <field name="path"      type="string"   indexed="true"  stored="true"  multiValued="false" /> 

修改solrconfig.xml

     

  <requestHandler name="/update/extract" class="solr.extraction.ExtractingRequestHandler" >
   <lst name="defaults">
    <str name="fmap.content">text</str>
    <str name="lowernames">true</str>
    <str name="uprefix">attr_</str>
    <str name="captureAttr">true</str>
   </lst>
  </requestHandler>



加入相應的jar包

 

  solr-4.7.2\dist    solr-cell-4.7.2

  solr-4.7.2\contrib\extraction  所有的jar包


然後引入jar包

    在solrconfig.xml文件中加入下面的配置(假設上述jar包已複製到 ../extract 目錄,該相對路徑以core0的目錄爲基準)

  <lib dir="../extract" regex=".*\.jar" />

重啓tomcat


數據準備

   

然後代碼編寫 

/**
 * Indexes every regular file directly under {@code G:/document/} by handing
 * each one to {@link #indexFilesSolrCell(String, String)}.
 *
 * A failure on one file is reported and does not stop the remaining files.
 *
 * @param args unused
 */
public static void main(String[] args) {
		File parentFile = new File("G:/document/");
		// listFiles() returns null when the path is not a directory or cannot
		// be read, so guard against it instead of only checking exists().
		File[] files = parentFile.listFiles();
		if (files == null) {
			System.err.println("Not a readable directory: " + parentFile.getPath());
			return;
		}
		for (File file : files) {
			// Only regular files can be uploaded; skip sub-directories.
			if (!file.isFile()) {
				continue;
			}
			try {
				indexFilesSolrCell(file.getName(), file.getPath());
			} catch (IOException e) {
				System.err.println("Failed to index " + file.getPath());
				e.printStackTrace();
			} catch (SolrServerException e) {
				System.err.println("Failed to index " + file.getPath());
				e.printStackTrace();
			}
		}
	}

	/**
	 * Uploads one file to the {@code /update/extract} handler of the
	 * {@code core0} Solr core, commits, then queries all documents and prints
	 * the {@code attr_filename} value of each returned document.
	 * 
	 * @param fileName
	 *            name of the file; also used as the document id
	 *            ({@code literal.id}) and to pick the content type
	 * @param path
	 *            path of the file to upload; stored as {@code literal.path}
	 * @throws IOException
	 *             if the file cannot be read or the request fails on I/O
	 * @throws SolrServerException
	 *             if Solr reports an error for the update or the query
	 */
	public static void indexFilesSolrCell(String fileName, String path)
			throws IOException, SolrServerException {
		// Connect to the Solr core.
		String urlString = "http://localhost:8080/solr/core0";
		SolrServer solr = new HttpSolrServer(urlString);
		try {
			ContentStreamUpdateRequest up = new ContentStreamUpdateRequest(
					"/update/extract");

			String contentType = getFileContentType(fileName);
			up.addFile(new File(path), contentType);
			up.setParam("literal.id", fileName);
			up.setParam("literal.path", path);
			up.setParam("literal.fileName", fileName);
			// Sent explicitly with the request; presumably this takes
			// precedence over any fmap.content default configured on the
			// handler - verify against the handler configuration.
			up.setParam("fmap.content", "attr_content");
			up.setAction(AbstractUpdateRequest.ACTION.COMMIT, true, true);

			// Further literal.<field> parameters (title, time, ...) can be
			// attached the same way with up.setParam("literal.<field>", value).
			solr.request(up);

			QueryResponse rsp = solr.query(new SolrQuery("*:*"));

			SolrDocumentList solrDocumentList = rsp.getResults();
			for (SolrDocument solrDocument : solrDocumentList) {
				// NOTE(review): attr_filename is presumably where
				// literal.fileName ends up after the handler's name
				// mapping - confirm against the handler configuration.
				System.out.println(solrDocument.getFieldValue("attr_filename"));
			}
		} finally {
			// A new client is created on every call; release its HTTP
			// resources so repeated calls do not leak connections.
			solr.shutdown();
		}
	}

	/**
	 * Maps a file name to a content type based on its extension.
	 * 
	 * The extension is the text after the last {@code '.'} and is matched
	 * case-insensitively, so {@code "A.PDF"} and {@code "a.pdf"} give the same
	 * result. Names without a dot, and unrecognised extensions, yield
	 * {@code "othertype"}.
	 * 
	 * @param filename
	 *            file name to inspect; must not be {@code null}
	 * @return the content type for a known extension, otherwise
	 *         {@code "othertype"}
	 */
	public static String getFileContentType(String filename) {
		int dot = filename.lastIndexOf('.');
		if (dot < 0) {
			// No extension at all: do not treat the whole name as one.
			return "othertype";
		}
		String extension = filename.substring(dot + 1);

		if (extension.equalsIgnoreCase("xlsx")) {
			return "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
		}
		if (extension.equalsIgnoreCase("pdf")) {
			return "application/pdf";
		}
		if (extension.equalsIgnoreCase("doc")) {
			return "application/msword";
		}
		if (extension.equalsIgnoreCase("txt")) {
			return "text/plain";
		}
		if (extension.equalsIgnoreCase("xls")) {
			return "application/vnd.ms-excel";
		}
		if (extension.equalsIgnoreCase("docx")) {
			return "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
		}
		if (extension.equalsIgnoreCase("ppt")) {
			return "application/vnd.ms-powerpoint";
		}
		if (extension.equalsIgnoreCase("pptx")) {
			return "application/vnd.openxmlformats-officedocument.presentationml.presentation";
		}
		return "othertype";
	}

  solr操作

    



   

    

   

發佈了36 篇原創文章 · 獲贊 3 · 訪問量 2萬+
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章