• Jump To … +
    aggregation.rs basic_search.rs custom_collector.rs custom_tokenizer.rs date_time_field.rs deleting_updating_documents.rs faceted_search.rs faceted_search_with_tweaked_score.rs fuzzy_search.rs index_from_multiple_threads.rs index_with_json.rs integer_range_search.rs ip_field.rs iterating_docs_and_positions.rs json_field.rs phrase_prefix_search.rs pre_tokenized_text.rs snippet.rs stop_words.rs warmer.rs
  • §

    Aggregation example

    This example shows how you can use built-in aggregations. We will use nested aggregations with buckets and metrics:

    • Range buckets: compute the average price in each bucket.
    • Terms aggregation: compute the minimum price in each bucket.

  • §
    use serde_json::{Deserializer, Value};
    use tantivy::aggregation::agg_req::Aggregations;
    use tantivy::aggregation::agg_result::AggregationResults;
    use tantivy::aggregation::AggregationCollector;
    use tantivy::query::AllQuery;
    use tantivy::schema::{self, IndexRecordOption, Schema, TextFieldIndexing, FAST};
    use tantivy::{Index, IndexWriter, TantivyDocument};
    
    fn main() -> tantivy::Result<()> {
  • §

    Create Schema

    Let's create a schema for a footwear shop with three fields: category, stock and price. All three are fast fields, as that's the requirement for aggregation queries. (The sample documents also contain a name, but no name field is added to the schema below.)

        let mut schema_builder = Schema::builder();
  • §

    In preparation for the terms aggregation, the category field is configured with:

    • set_fast
    • raw tokenizer

    The tokenizer is set to “raw” because the fast field uses the same dictionary as the inverted index. (This behaviour will change in tantivy 0.20, where the fast field will always be raw-tokenized, independently of the regular tokenization.)

        // Options for the `category` text field: indexed with the "raw" tokenizer
        // and term frequencies, configured as a fast field, and stored.
        let text_fieldtype = schema::TextOptions::default()
            .set_indexing_options(
                TextFieldIndexing::default()
                    .set_index_option(IndexRecordOption::WithFreqs)
                    .set_tokenizer("raw"),
            )
            .set_fast(None)
            .set_stored();
        schema_builder.add_text_field("category", text_fieldtype);
        // `stock` and `price` are f64 fast fields; the aggregation requests
        // further down reference them by these names.
        schema_builder.add_f64_field("stock", FAST);
        schema_builder.add_f64_field("price", FAST);
    
        // Finalize the builder into the schema used to create the index.
        let schema = schema_builder.build();
  • §

    Indexing documents

    Let's index a bunch of documents for this example.

        let index = Index::create_in_ram(schema.clone());
    
        let data = r#"{
            "name": "Almond Toe Court Shoes, Patent Black",
            "category": "Womens Footwear",
            "price": 99.00,
            "stock": 5
        }
        {
            "name": "Suede Shoes, Blue",
            "category": "Womens Footwear",
            "price": 42.00,
            "stock": 4
        }
        {
            "name": "Leather Driver Saddle Loafers, Tan",
            "category": "Mens Footwear",
            "price": 34.00,
            "stock": 12
        }
        {
            "name": "Flip Flops, Red",
            "category": "Mens Footwear",
            "price": 19.00,
            "stock": 6
        }
        {
            "name": "Flip Flops, Blue",
            "category": "Mens Footwear",
            "price": 19.00,
            "stock": 0
        }
        {
            "name": "Gold Button Cardigan, Black",
            "category": "Womens Casualwear",
            "price": 167.00,
            "stock": 6
        }
        {
            "name": "Cotton Shorts, Medium Red",
            "category": "Womens Casualwear",
            "price": 30.00,
            "stock": 5
        }
        {
            "name": "Fine Stripe Short SleeveShirt, Grey",
            "category": "Mens Casualwear",
            "price": 49.99,
            "stock": 9
        }
        {
            "name": "Fine Stripe Short SleeveShirt, Green",
            "category": "Mens Casualwear",
            "price": 49.99,
            "offer": 39.99,
            "stock": 9
        }
        {
            "name": "Sharkskin Waistcoat, Charcoal",
            "category": "Mens Formalwear",
            "price": 75.00,
            "stock": 2
        }
        {
            "name": "Lightweight Patch PocketBlazer, Deer",
            "category": "Mens Formalwear",
            "price": 175.50,
            "stock": 1
        }
        {
            "name": "Bird Print Dress, Black",
            "category": "Womens Formalwear",
            "price": 270.00,
            "stock": 10
        }
        {
            "name": "Mid Twist Cut-Out Dress, Pink",
            "category": "Womens Formalwear",
            "price": 540.00,
            "stock": 5
        }"#;
    
        let stream = Deserializer::from_str(data).into_iter::<Value>();
    
        let mut index_writer: IndexWriter = index.writer(50_000_000)?;
        let mut num_indexed = 0;
        for value in stream {
            // Each stream item is a `Result`; propagate a malformed-JSON error with `?`
            // instead of panicking, as the rest of this line already does.
            let doc = TantivyDocument::parse_json(&schema, &serde_json::to_string(&value?)?)?;
            index_writer.add_document(doc)?;
            num_indexed += 1;
            if num_indexed > 4 {
  • §

    Writing the first segment

                index_writer.commit()?;
            }
        }
  • §

    Writing the second segment

        index_writer.commit()?;
  • §

    We have two segments now. The AggregationCollector will run the aggregation on each segment and then merge the results into an IntermediateAggregationResult.

        let reader = index.reader()?;
        let searcher = reader.searcher();
  • §

  • §

    Aggregation Query

    We can construct the query by building the request structure or by deserializing from JSON. The JSON API is more stable and therefore recommended.

    Request 1

        let agg_req_str = r#"
        {
          "group_by_stock": {
            "aggs": {
              "average_price": { "avg": { "field": "price" } }
            },
            "range": {
              "field": "stock",
              "ranges": [
                { "key": "few", "to": 1.0 },
                { "key": "some", "from": 1.0, "to": 10.0 },
                { "key": "many", "from": 10.0 }
              ]
            }
          }
        } "#;
  • §

    In this Aggregation we want to get the average price for different groups, depending on how many items are in stock. We define custom ranges few, some, many via the range aggregation. For every bucket we want the average price, so we create a nested metric aggregation on the range bucket aggregation. Only buckets support nested aggregations.

    Request JSON API

        let agg_req: Aggregations = serde_json::from_str(agg_req_str)?;
        let collector = AggregationCollector::from_aggs(agg_req, Default::default());
  • §

    We use the AllQuery which will pass all documents to the AggregationCollector.

        // `search` returns a `tantivy::Result`; propagate a failure to the caller of
        // `main` with `?` rather than panicking.
        let agg_res: AggregationResults = searcher.search(&AllQuery, &collector)?;
        // Convert the typed result into a generic JSON value for comparison.
        let res: Value = serde_json::to_value(agg_res)?;
  • §

    Aggregation Result

    The resulting structure serializes to the same JSON format as Elasticsearch.

        let expected_res = r#"
        {
            "group_by_stock":{
                "buckets":[
                    {"average_price":{"value":19.0},"doc_count":1,"key":"few","to":1.0},
                    {"average_price":{"value":124.748},"doc_count":10,"from":1.0,"key":"some","to":10.0},
                    {"average_price":{"value":152.0},"doc_count":2,"from":10.0,"key":"many"}
                ]
            }
        }
        "#;
        let expected_json: Value = serde_json::from_str(expected_res)?;
        assert_eq!(expected_json, res);
  • §

    Request 2

    Now we are interested in the minimum price per category, so we create a bucket per category via TermsAggregation. We are interested in the highest minimum prices, so we set "order": { "min_price": "desc" } to sort the buckets by the metric of the sub-aggregation. (awesome)

        let agg_req_str = r#"
        {
          "min_price_per_category": {
            "aggs": {
              "min_price": { "min": { "field": "price" } }
            },
            "terms": {
              "field": "category",
              "min_doc_count": 1,
              "order": { "min_price": "desc" }
            }
          }
        } "#;
    
        // Deserialize the JSON request and build a collector with default limits.
        let agg_req: Aggregations = serde_json::from_str(agg_req_str)?;
        let collector = AggregationCollector::from_aggs(agg_req, Default::default());

        // `search` returns a `tantivy::Result`; propagate a failure with `?` rather
        // than panicking.
        let agg_res: AggregationResults = searcher.search(&AllQuery, &collector)?;
        // Convert the typed result into a generic JSON value for comparison.
        let res: Value = serde_json::to_value(agg_res)?;
  • §

    Minimum price per category, sorted by minimum price descending

    As you can see, the starting prices for Formalwear are higher than those for Casualwear.

        let expected_res = r#"
        {
          "min_price_per_category": {
            "buckets": [
              { "doc_count": 2, "key": "Womens Formalwear", "min_price": { "value": 270.0 } },
              { "doc_count": 2, "key": "Mens Formalwear", "min_price": { "value": 75.0 } },
              { "doc_count": 2, "key": "Mens Casualwear", "min_price": { "value": 49.99 } },
              { "doc_count": 2, "key": "Womens Footwear", "min_price": { "value": 42.0 } },
              { "doc_count": 2, "key": "Womens Casualwear", "min_price": { "value": 30.0 } },
              { "doc_count": 3, "key": "Mens Footwear", "min_price": { "value": 19.0 } }
            ],
            "sum_other_doc_count": 0
          }
        }
        "#;
        let expected_json: Value = serde_json::from_str(expected_res)?;
    
        assert_eq!(expected_json, res);
    
        Ok(())
    }