    Stop Words Example

    This example covers the basic usage of stop words with tantivy.

    We will:

    • define our schema
    • create an index in RAM
    • add a few stop words
    • index a few documents in our index

    Importing tantivy…

    use tantivy::collector::TopDocs;
    use tantivy::query::QueryParser;
    use tantivy::schema::*;
    use tantivy::tokenizer::*;
    use tantivy::{doc, Index, IndexWriter};
    
    fn main() -> tantivy::Result<()> {

    This example assumes you understand the content in basic_search.

        let mut schema_builder = Schema::builder();

    This configures how tantivy will store and process your content in the index. The key thing to note is that we set the tokenizer to "stoppy", which will be defined and registered below.

        let text_field_indexing = TextFieldIndexing::default()
            .set_tokenizer("stoppy")
            .set_index_option(IndexRecordOption::WithFreqsAndPositions);
        let text_options = TextOptions::default()
            .set_indexing_options(text_field_indexing)
            .set_stored();

    Our first field is title.

        schema_builder.add_text_field("title", text_options);

    Our second field is body.

        let text_field_indexing = TextFieldIndexing::default()
            .set_tokenizer("stoppy")
            .set_index_option(IndexRecordOption::WithFreqsAndPositions);
        let text_options = TextOptions::default()
            .set_indexing_options(text_field_indexing)
            .set_stored();
        schema_builder.add_text_field("body", text_options);
    
        let schema = schema_builder.build();
    
        let index = Index::create_in_ram(schema.clone());

    This tokenizer lowercases all of the text (to help with stop word matching) and then removes every instance of "the" and "and" from the token stream.

        let tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
            .filter(LowerCaser)
            .filter(StopWordFilter::remove(vec![
                "the".to_string(),
                "and".to_string(),
            ]))
            .build();
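
        // For reference, tantivy also bundles predefined stop word lists. Below is a
        // sketch (for illustration only; we do not register it) of the same analyzer
        // built from the built-in English list. `StopWordFilter::new` returns an
        // `Option`, so we only build the analyzer when a list is available.
        if let Some(english_stop_words) = StopWordFilter::new(Language::English) {
            let _predefined = TextAnalyzer::builder(SimpleTokenizer::default())
                .filter(LowerCaser)
                .filter(english_stop_words)
                .build();
        }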
    
        index.tokenizers().register("stoppy", tokenizer);
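
        // As a quick illustration (an extra step, not needed for the search below),
        // fetch the registered analyzer back and print the tokens that survive the
        // filter: "The Old Man and the Sea" should reduce to `old`, `man`, `sea`.
        if let Some(mut stoppy) = index.tokenizers().get("stoppy") {
            let mut stream = stoppy.token_stream("The Old Man and the Sea");
            while let Some(token) = stream.next() {
                print!("{} ", token.text);
            }
            println!();
        }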
    
        let mut index_writer: IndexWriter = index.writer(50_000_000)?;
    
        let title = schema.get_field("title").unwrap();
        let body = schema.get_field("body").unwrap();
    
        index_writer.add_document(doc!(
            title => "The Old Man and the Sea",
            body => "He was an old man who fished alone in a skiff in the Gulf Stream and \
                     he had gone eighty-four days now without taking a fish."
        ))?;

        index_writer.add_document(doc!(
            title => "Of Mice and Men",
            body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
                     bank and runs deep and green. The water is warm too, for it has slipped twinkling \
                     over the yellow sands in the sunlight before reaching the narrow pool. On one \
                     side of the river the golden foothill slopes curve up to the strong and rocky \
                     Gabilan Mountains, but on the valley side the water is lined with trees—willows \
                     fresh and green with every spring, carrying in their lower leaf junctures the \
                     debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
                     limbs and branches that arch over the pool"
        ))?;

        index_writer.add_document(doc!(
            title => "Frankenstein",
            body => "You will rejoice to hear that no disaster has accompanied the commencement of an \
                     enterprise which you have regarded with such evil forebodings.  I arrived here \
                     yesterday, and my first task is to assure my dear sister of my welfare and \
                     increasing confidence in the success of my undertaking."
        ))?;
    
        index_writer.commit()?;
    
        let reader = index.reader()?;
    
        let searcher = reader.searcher();
    
        let query_parser = QueryParser::for_index(&index, vec![title, body]);

    Stop words are applied to the query as well, so the following is equivalent to title:frankenstein.

        let query = query_parser.parse_query("title:\"the Frankenstein\"")?;
        let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
    
        for (score, doc_address) in top_docs {
            let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
            println!("\n==\nDocument score {score}:");
            println!("{}", retrieved_doc.to_json(&schema));
        }
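
        // Since the stop word filter already dropped "the" from the phrase query
        // above, a plain `title:frankenstein` query should surface the same
        // document; this extra check is only here for illustration.
        let plain_query = query_parser.parse_query("title:frankenstein")?;
        let plain_top_docs = searcher.search(&plain_query, &TopDocs::with_limit(10))?;
        println!("\nplain query returned {} document(s)", plain_top_docs.len());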
    
        Ok(())
    }