The code is straightforward and easy to follow, and it is worth borrowing from in real work; the original post is here. The example demonstrates two ways to build a full index:
one builds the full index from a database via SQL;
the other builds the full index by parsing files with Tika.
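Note that the listing targets the SolrJ 3.x API: StreamingUpdateSolrServer was renamed ConcurrentUpdateSolrServer in Solr 4.0, so on newer versions substitute that class (a minimal sketch follows the listing). Tika and the MySQL JDBC driver also need to be on the classpath.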
package SolrJExample;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.StreamingUpdateSolrServer;
import org.apache.solr.client.solrj.impl.XMLResponseParser;
import org.apache.solr.client.solrj.response.UpdateResponse;
import org.apache.solr.common.SolrInputDocument;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.sql.*;
import java.util.ArrayList;
import java.util.Collection;
/* Example class showing the skeleton of using Tika and
Sql on the client to index documents from
both structured documents and a SQL database.
NOTE: The SQL example and the Tika example are entirely orthogonal.
Both are included here to make a
more interesting example, but you can omit either of them.
*/
public class SqlTikaExample {
private StreamingUpdateSolrServer _server;
private long _start = System.currentTimeMillis();
private AutoDetectParser _autoParser;
private int _totalTika = 0;
private int _totalSql = 0;
private Collection<SolrInputDocument> _docs = new ArrayList<SolrInputDocument>();
public static void main(String[] args) {
try {
SqlTikaExample idxer = new SqlTikaExample("http://localhost:8983/solr");
idxer.doTikaDocuments(new File("/Users/Erick/testdocs"));
idxer.doSqlDocuments();
idxer.endIndexing();
} catch (Exception e) {
e.printStackTrace();
}
}
private SqlTikaExample(String url) throws IOException, SolrServerException {
// Create a multi-threaded communications channel to the Solr server.
// Could be CommonsHttpSolrServer as well.
//
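// Args: the Solr URL, a buffer of up to 10 queued update requests,
// and 4 background threads draining the queue.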
_server = new StreamingUpdateSolrServer(url, 10, 4);
_server.setSoTimeout(1000); // socket read timeout
_server.setConnectionTimeout(1000);
_server.setMaxRetries(1); // defaults to 0. > 1 not recommended.
// binary parser is used by default for responses
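// (javabin is faster, but the XML parser still works when the client and
// server Solr versions differ, since the javabin wire format has changed
// between releases)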
_server.setParser(new XMLResponseParser());
// One of the ways Tika can be used to attempt to parse arbitrary files.
_autoParser = new AutoDetectParser();
}
// Just a convenient place to wrap things up.
private void endIndexing() throws IOException, SolrServerException {
if (_docs.size() > 0) { // Are there any documents left over?
_server.add(_docs, 300000); // Commit within 5 minutes
}
_server.commit(); // Only needs to be done at the end,
// commitWithin should do the rest.
// Could even be omitted
// assuming commitWithin was specified.
long endTime = System.currentTimeMillis();
log("Total Time Taken: " + (endTime - _start) +
" milliseconds to index " + _totalSql +
" SQL rows and " + _totalTika + " documents");
}
// I hate writing System.out.println() everywhere; besides, this gives a
// central place to convert to true logging in a production system.
private static void log(String msg) {
System.out.println(msg);
}
/**
 * *************************** Tika processing here
 */
// Recursively traverse the filesystem, parsing everything found.
private void doTikaDocuments(File root) throws IOException, SolrServerException {
// Simple loop for recursively indexing all the files
// in the root directory passed in.
File[] entries = root.listFiles();
if (entries == null) {
  return; // Not a readable directory; nothing to index.
}
for (File file : entries) {
if (file.isDirectory()) {
doTikaDocuments(file);
continue;
}
// Get ready to parse the file.
ContentHandler textHandler = new BodyContentHandler();
Metadata metadata = new Metadata();
ParseContext context = new ParseContext();
InputStream input = new FileInputStream(file);
// Try parsing the file. Note we haven't checked at all to
// see whether this file is a good candidate.
try {
  _autoParser.parse(input, textHandler, metadata, context);
} catch (Exception e) {
  // Needs better logging of what went wrong in order to
  // track down "bad" documents.
  log(String.format("File %s failed", file.getCanonicalPath()));
  e.printStackTrace();
  continue;
} finally {
  input.close(); // Don't leak the file handle, whether or not the parse succeeded.
}
// Just to show how much meta-data and what form it's in.
dumpMetadata(file.getCanonicalPath(), metadata);
// Index just a couple of the meta-data fields.
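// (assumes the target schema defines "id", "author", and "text" fields)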
SolrInputDocument doc = new SolrInputDocument();
doc.addField("id", file.getCanonicalPath());
// Crude way to get known meta-data fields.
// Also possible to write a simple loop to examine all the
// metadata returned and selectively index it and/or
// just get a list of them.
// One can also use the LucidWorks field mapping to
// accomplish much the same thing.
String author = metadata.get("Author");
if (author != null) {
doc.addField("author", author);
}
doc.addField("text", textHandler.toString());
_docs.add(doc);
++_totalTika;
// Completely arbitrary, just batch up more than one document
// for throughput!
if (_docs.size() >= 1000) {
// Commit within 5 minutes.
UpdateResponse resp = _server.add(_docs, 300000);
if (resp.getStatus() != 0) {
log("Some horrible error has occurred, status is: " +
resp.getStatus());
}
_docs.clear();
}
}
}
// Just to show all the metadata that's available.
private void dumpMetadata(String fileName, Metadata metadata) {
log("Dumping metadata for file: " + fileName);
for (String name : metadata.names()) {
log(name + ":" + metadata.get(name));
}
log("\n\n");
}
/**
 * *************************** SQL processing here
 */
private void doSqlDocuments() throws SQLException {
Connection con = null;
try {
Class.forName("com.mysql.jdbc.Driver").newInstance();
log("Driver Loaded");
con = DriverManager.getConnection("jdbc:mysql://192.168.1.103:3306/test?"
+ "user=testuser&password=test123");
Statement st = con.createStatement();
ResultSet rs = st.executeQuery("select id,title,text from test");
while (rs.next()) {
// DO NOT move this outside the while loop
// or be sure to call doc.clear()
SolrInputDocument doc = new SolrInputDocument();
String id = rs.getString("id");
String title = rs.getString("title");
String text = rs.getString("text");
doc.addField("id", id);
doc.addField("title", title);
doc.addField("text", text);
_docs.add(doc);
++_totalSql;
// Completely arbitrary, just batch up more than one
// document for throughput!
if (_docs.size() >= 1000) { // Same batch threshold as the Tika path.
// Commit within 5 minutes.
UpdateResponse resp = _server.add(_docs, 300000);
if (resp.getStatus() != 0) {
log("Some horrible error has occurred, status is: " +
resp.getStatus());
}
_docs.clear();
}
}
} catch (Exception ex) {
ex.printStackTrace();
} finally {
if (con != null) {
con.close();
}
}
}
}
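For reference, on SolrJ 4.x and later the same multi-threaded update channel goes through ConcurrentUpdateSolrServer, the renamed StreamingUpdateSolrServer. A minimal sketch, assuming the same Solr URL and the same queue/thread tuning as above (the document id and text are placeholder values):

import org.apache.solr.client.solrj.impl.ConcurrentUpdateSolrServer;
import org.apache.solr.common.SolrInputDocument;

public class ConcurrentUpdateSketch {
  public static void main(String[] args) throws Exception {
    // Same arguments as StreamingUpdateSolrServer: URL, queue size 10, 4 sender threads.
    ConcurrentUpdateSolrServer server =
        new ConcurrentUpdateSolrServer("http://localhost:8983/solr", 10, 4);
    SolrInputDocument doc = new SolrInputDocument();
    doc.addField("id", "sketch-1");   // placeholder id
    doc.addField("text", "hello");    // placeholder body text
    server.add(doc, 300000);          // commit within 5 minutes, as in the example above
    server.commit();                  // commits flush the internal request queue
    server.shutdown();                // stop the background sender threads
  }
}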