Jump To … +

aggregation.rs basic_search.rs custom_collector.rs custom_tokenizer.rs date_time_field.rs deleting_updating_documents.rs faceted_search.rs faceted_search_with_tweaked_score.rs fuzzy_search.rs index_from_multiple_threads.rs index_with_json.rs integer_range_search.rs ip_field.rs iterating_docs_and_positions.rs json_field.rs phrase_prefix_search.rs pre_tokenized_text.rs snippet.rs stop_words.rs warmer.rs

§

Custom collector example

This example shows how you can implement your own collector. As an example, we will compute a collector that computes the standard deviation of a given fast field.

Of course, you can have a look at the tantivy’s built-in collectors such as the CountCollector for more examples.
```
use columnar::Column;
```
§

Importing tantivy…

use tantivy::collector::{Collector, SegmentCollector};
use tantivy::index::SegmentReader;
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, FAST, INDEXED, TEXT};
use tantivy::{doc, Index, IndexWriter, Score};

#[derive(Default)]
struct Stats {
    count: usize,
    sum: f64,
    squared_sum: f64,
}

impl Stats {
    pub fn count(&self) -> usize {
        self.count
    }

    pub fn mean(&self) -> f64 {
        self.sum / (self.count as f64)
    }

    fn square_mean(&self) -> f64 {
        self.squared_sum / (self.count as f64)
    }

    pub fn standard_deviation(&self) -> f64 {
        let mean = self.mean();
        (self.square_mean() - mean * mean).sqrt()
    }

    fn non_zero_count(self) -> Option<Stats> {
        if self.count == 0 {
            None
        } else {
            Some(self)
        }
    }
}

struct StatsCollector {
    field: String,
}

impl StatsCollector {
    fn with_field(field: String) -> StatsCollector {
        StatsCollector { field }
    }
}

impl Collector for StatsCollector {

That’s the type of our result. Our standard deviation will be a float.

    type Fruit = Option<Stats>;

    type Child = StatsSegmentCollector;

    fn for_segment(
        &self,
        _segment_local_id: u32,
        segment_reader: &SegmentReader,
    ) -> tantivy::Result<StatsSegmentCollector> {
        let fast_field_reader = segment_reader.fast_fields().u64(&self.field)?;
        Ok(StatsSegmentCollector {
            fast_field_reader,
            stats: Stats::default(),
        })
    }

    fn requires_scoring(&self) -> bool {

this collector does not care about score.

        false
    }

    fn merge_fruits(&self, segment_stats: Vec<Option<Stats>>) -> tantivy::Result<Option<Stats>> {
        let mut stats = Stats::default();
        for segment_stats in segment_stats.into_iter().flatten() {
            stats.count += segment_stats.count;
            stats.sum += segment_stats.sum;
            stats.squared_sum += segment_stats.squared_sum;
        }
        Ok(stats.non_zero_count())
    }
}

struct StatsSegmentCollector {
    fast_field_reader: Column,
    stats: Stats,
}

impl SegmentCollector for StatsSegmentCollector {
    type Fruit = Option<Stats>;

    fn collect(&mut self, doc: u32, _score: Score) {

Since we know the values are single value, we could call first_or_default_col on the column and fetch single values.

        for value in self.fast_field_reader.values_for_doc(doc) {
            let value = value as f64;
            self.stats.count += 1;
            self.stats.sum += value;
            self.stats.squared_sum += value * value;
        }
    }

    fn harvest(self) -> <Self as SegmentCollector>::Fruit {
        self.stats.non_zero_count()
    }
}

fn main() -> tantivy::Result<()> {

§

Defining the schema

The Tantivy index requires a very strict schema. The schema declares which fields are in the index, and for each field, its type and “the way it should be indexed”.

first we need to define a schema …

    let mut schema_builder = Schema::builder();

We’ll assume a fictional index containing products, and with a name, a description, and a price.

    let product_name = schema_builder.add_text_field("name", TEXT);
    let product_description = schema_builder.add_text_field("description", TEXT);
    let price = schema_builder.add_u64_field("price", INDEXED | FAST);
    let schema = schema_builder.build();

Indexing documents

Lets index a bunch of fake documents for the sake of this example.

    let index = Index::create_in_ram(schema);

    let mut index_writer: IndexWriter = index.writer(50_000_000)?;
    index_writer.add_document(doc!(
        product_name => "Super Broom 2000",
        product_description => "While it is ok for short distance travel, this broom \
        was designed quiditch. It will up your game.",
        price => 30_200u64
    ))?;
    index_writer.add_document(doc!(
        product_name => "Turbulobroom",
        product_description => "You might have heard of this broom before : it is the sponsor of the Wales team.\
            You'll enjoy its sharp turns, and rapid acceleration",
        price => 29_240u64
    ))?;
    index_writer.add_document(doc!(
        product_name => "Broomio",
        product_description => "Great value for the price. This broom is a market favorite",
        price => 21_240u64
    ))?;
    index_writer.add_document(doc!(
        product_name => "Whack a Mole",
        product_description => "Prime quality bat.",
        price => 5_200u64
    ))?;
    index_writer.commit()?;

    let reader = index.reader()?;
    let searcher = reader.searcher();
    let query_parser = QueryParser::for_index(&index, vec![product_name, product_description]);

here we want to search for broom and use StatsCollector on the hits.

    let query = query_parser.parse_query("broom")?;
    if let Some(stats) =
        searcher.search(&query, &StatsCollector::with_field("price".to_string()))?
    {
        println!("count: {}", stats.count());
        println!("mean: {}", stats.mean());
        println!("standard deviation: {}", stats.standard_deviation());
    }

    Ok(())
}

Custom collector example

Defining the schema

Indexing documents