use std::sync::{Arc, RwLock};
use std::thread;
use std::time::Duration;
use tantivy::schema::{Schema, STORED, TEXT};
use tantivy::{doc, Index, IndexWriter, Opstamp, TantivyError};
fn main() -> tantivy::Result<()> {
It is fairly common to have to index from different threads.
Tantivy forbids creating more than one IndexWriter
at a time.
This IndexWriter
itself has its own multithreaded layer, so managing your own
indexing threads will not help. However, it can still be useful for some applications.
For instance, if preparing documents to send to tantivy before indexing is the bottleneck of your application, it is reasonable to have multiple threads.
Another very common reason to want to index from multiple threads is implementing a web server with CRUD capabilities. The server framework will most likely handle requests from different threads.
The recommended way to address both of these use cases is to wrap your IndexWriter
in an
Arc<RwLock<IndexWriter>>
.
While this is counterintuitive, adding and deleting documents do not require mutability
over the IndexWriter
, so several threads will be able to do this operation concurrently.
The example below does not represent an actual real-life use case (who would spawn a thread to index a single document?), but aims at demonstrating the mechanism that makes indexing from several threads possible.
Importing tantivy…
use std::sync::{Arc, RwLock};
use std::thread;
use std::time::Duration;
use tantivy::schema::{Schema, STORED, TEXT};
use tantivy::{doc, Index, IndexWriter, Opstamp, TantivyError};
fn main() -> tantivy::Result<()> {
let mut schema_builder = Schema::builder();
let title = schema_builder.add_text_field("title", TEXT | STORED);
let body = schema_builder.add_text_field("body", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let index_writer: Arc<RwLock<IndexWriter>> = Arc::new(RwLock::new(index.writer(50_000_000)?));
let index_writer_clone_1 = index_writer.clone();
thread::spawn(move || {
we index 100 times the document… for the sake of the example.
for i in 0..100 {
let opstamp = index_writer_clone_1
.read().unwrap() //< A read lock is sufficient here.
.add_document(
doc!(
title => "Of Mice and Men",
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
))?;
println!("add doc {} from thread 1 - opstamp {}", i, opstamp);
thread::sleep(Duration::from_millis(20));
}
Result::<(), TantivyError>::Ok(())
});
let index_writer_clone_2 = index_writer.clone();
For convenience, tantivy also comes with a macro to reduce the boilerplate above.
thread::spawn(move || {
we index 100 times the document… for the sake of the example.
for i in 0..100 {
A read lock is sufficient here.
let opstamp = {
let index_writer_rlock = index_writer_clone_2.read().unwrap();
index_writer_rlock.add_document(doc!(
title => "Manufacturing consent",
body => "Some great book description..."
))?
};
println!("add doc {} from thread 2 - opstamp {}", i, opstamp);
thread::sleep(Duration::from_millis(10));
}
Result::<(), TantivyError>::Ok(())
});
for _ in 0..10 {
let opstamp: Opstamp = {
Committing or rollbacking on the other hand requires write lock. This will block other threads.
let mut index_writer_wlock = index_writer.write().unwrap();
index_writer_wlock.commit()?
};
println!("committed with opstamp {opstamp}");
thread::sleep(Duration::from_millis(500));
}
Ok(())
}