Skip to contents

Class to construct the BM25 search object

Methods


Method new()

Creates a new instance of a BM25 class

Usage

BM25$new(data = NULL, lang = "detect", k1 = 1.2, b = 0.75, metadata = NULL)

Arguments

data

text data, a vector of strings. Note that any preprocessing steps (tolower, removing stopwords, etc.) need to have taken place before this!

lang

language of the data, see self$available_languages(), can also be "detect" to automatically detect the language, default is "detect"

k1

k1 parameter of BM25, default is 1.2

b

b parameter of BM25, default is 0.75

metadata

a data.frame with metadata for each document, default is NULL. If provided, it must be a data.frame with the same number of rows as there are documents in data, containing arbitrary metadata for each document, e.g. a file path or a URL

Returns

BM25 object

Examples

corpus <- c(
 "The rabbit munched the orange carrot.",
 "The snake hugged the green lizard.",
 "The hedgehog impaled the orange orange.",
 "The squirrel buried the brown nut."
)
bm25 <- BM25$new(data = corpus, lang = "en",
                 metadata = data.frame(src = paste("file", 1:4)))
bm25
bm25$get_data()

bm25$query("orange", max_n = 2)
bm25$query("orange", max_n = 3)
bm25$query("orange") # return all, same as max_n = Inf or NULL


Method available_languages()

Returns the available languages

Usage

BM25$available_languages()

Returns

a named character vector with language codes and their full names

Examples

BM25$new()$available_languages()


Method get_data()

Returns the data

Usage

BM25$get_data(add_metadata = TRUE)

Arguments

add_metadata

whether to add metadata to the data, default is TRUE

Returns

a data.frame with the data and metadata if available and selected

Examples

BM25$new(data = letters, metadata = LETTERS)$get_data()


Method get_lang()

Returns the language used

Usage

BM25$get_lang()

Returns

a character string with the language used, given as its full name (e.g. "English", or "Detect" when the language is detected automatically)

Examples

BM25$new()$get_lang()
BM25$new(lang = "en")$get_lang()
BM25$new(lang = "detect")$get_lang()


Method print()

Prints a BM25 object

Usage

BM25$print(n = 5, nchar = 20)

Arguments

n

number of data entries (documents) to print, default is 5

nchar

number of characters to print for each text, default is 20

Returns

the object, invisibly

Examples

BM25$new(data = letters, metadata = LETTERS)


Method add_data()

Adds data to the BM25 object

This can be useful to add more data later on. Note that this will rebuild the engine.

Usage

BM25$add_data(data, metadata = NULL)

Arguments

data

a vector of strings

metadata

a data.frame with metadata for each document, default is NULL

Returns

NULL

Examples

bm25 <- BM25$new()
bm25$add_data(letters, metadata = LETTERS)
bm25


Method query()

Query the BM25 object for the N best matches

Usage

BM25$query(query, max_n = NULL, return_text = TRUE, return_metadata = TRUE)

Arguments

query

the term to search for. Note that all preprocessing that was initially applied to the text corpus (e.g., tolower, removing stopwords, etc.) must already have been applied to the term as well

max_n

the maximum number of results to return, default is NULL, which returns all results (same as Inf)

return_text

whether to return the text, default is TRUE

return_metadata

whether to return metadata, default is TRUE

Returns

a data.frame with the results

Examples

corpus <- c(
 "The rabbit munched the orange carrot.",
 "The snake hugged the green lizard.",
 "The hedgehog impaled the orange orange.",
 "The squirrel buried the brown nut."
)
bm25 <- BM25$new(data = corpus, lang = "en",
                 metadata = data.frame(src = paste("file", 1:4)))

bm25$query("orange", max_n = 2)
bm25$query("orange", max_n = 3)
bm25$query("orange", return_text = FALSE, return_metadata = FALSE)
bm25$query("orange", max_n = 3)


Method clone()

The objects of this class are cloneable with this method.

Usage

BM25$clone(deep = FALSE)

Arguments

deep

Whether to make a deep clone.

Examples

corpus <- c(
  "The rabbit munched the orange carrot.",
  "The snake hugged the green lizard.",
  "The hedgehog impaled the orange orange.",
  "The squirrel buried the brown nut."
)
bm25 <- BM25$new(data = corpus, lang = "en",
                 metadata = data.frame(src = paste("file", 1:4)))
bm25$query("orange", max_n = 2)
#>   id     score rank                                    text    src
#> 1  3 0.4904281    1 The hedgehog impaled the orange orange. file 3
#> 2  1 0.3566750    2   The rabbit munched the orange carrot. file 1
bm25$query("orange")
#>   id     score rank                                    text    src
#> 1  3 0.4904281    1 The hedgehog impaled the orange orange. file 3
#> 2  1 0.3566750    2   The rabbit munched the orange carrot. file 1
#> 3  2 0.0000000    3      The snake hugged the green lizard. file 2
#> 4  4 0.0000000    3      The squirrel buried the brown nut. file 4

## ------------------------------------------------
## Method `BM25$new`
## ------------------------------------------------

corpus <- c(
 "The rabbit munched the orange carrot.",
 "The snake hugged the green lizard.",
 "The hedgehog impaled the orange orange.",
 "The squirrel buried the brown nut."
)
bm25 <- BM25$new(data = corpus, lang = "en",
                 metadata = data.frame(src = paste("file", 1:4)))
bm25
#> <BM25 (k1: 1.20, b: 0.75)> with 4 documents (language: 'English')
#>   - Data & Metadata 
#>                                      text metadata.src
#> 1   The rabbit munched the orange carrot.       file 1
#> 2      The snake hugged the green lizard.       file 2
#> 3 The hedgehog impaled the orange orange.       file 3
#> 4      The squirrel buried the brown nut.       file 4
bm25$get_data()
#>                                      text    src
#> 1   The rabbit munched the orange carrot. file 1
#> 2      The snake hugged the green lizard. file 2
#> 3 The hedgehog impaled the orange orange. file 3
#> 4      The squirrel buried the brown nut. file 4

bm25$query("orange", max_n = 2)
#>   id     score rank                                    text    src
#> 1  3 0.4904281    1 The hedgehog impaled the orange orange. file 3
#> 2  1 0.3566750    2   The rabbit munched the orange carrot. file 1
bm25$query("orange", max_n = 3)
#>   id     score rank                                    text    src
#> 1  3 0.4904281    1 The hedgehog impaled the orange orange. file 3
#> 2  1 0.3566750    2   The rabbit munched the orange carrot. file 1
#> 3  2 0.0000000    3      The snake hugged the green lizard. file 2
bm25$query("orange") # return all, same as max_n = Inf or NULL
#>   id     score rank                                    text    src
#> 1  3 0.4904281    1 The hedgehog impaled the orange orange. file 3
#> 2  1 0.3566750    2   The rabbit munched the orange carrot. file 1
#> 3  2 0.0000000    3      The snake hugged the green lizard. file 2
#> 4  4 0.0000000    3      The squirrel buried the brown nut. file 4

## ------------------------------------------------
## Method `BM25$available_languages`
## ------------------------------------------------

BM25$new()$available_languages()
#>           ar           da           nl           en           fr           de 
#>     "arabic"     "danish"      "dutch"    "english"     "french"     "german" 
#>           el           hu           it           no           pt           ro 
#>      "greek"  "hungarian"    "italian"  "norwegian" "portuguese"   "romanian" 
#>           ru           es           sv           ta           tr         auto 
#>    "russian"    "spanish"    "swedish"      "tamil"    "turkish"     "detect" 

## ------------------------------------------------
## Method `BM25$get_data`
## ------------------------------------------------

BM25$new(data = letters, metadata = LETTERS)$get_data()
#>    text metadata
#> 1     a        A
#> 2     b        B
#> 3     c        C
#> 4     d        D
#> 5     e        E
#> 6     f        F
#> 7     g        G
#> 8     h        H
#> 9     i        I
#> 10    j        J
#> 11    k        K
#> 12    l        L
#> 13    m        M
#> 14    n        N
#> 15    o        O
#> 16    p        P
#> 17    q        Q
#> 18    r        R
#> 19    s        S
#> 20    t        T
#> 21    u        U
#> 22    v        V
#> 23    w        W
#> 24    x        X
#> 25    y        Y
#> 26    z        Z

## ------------------------------------------------
## Method `BM25$get_lang`
## ------------------------------------------------

BM25$new()$get_lang()
#> [1] "Detect"
BM25$new(lang = "en")$get_lang()
#> [1] "English"
BM25$new(lang = "detect")$get_lang()
#> [1] "Detect"

## ------------------------------------------------
## Method `BM25$print`
## ------------------------------------------------

BM25$new(data = letters, metadata = LETTERS)
#> <BM25 (k1: 1.20, b: 0.75)> with 26 documents (language: 'Detect')
#>   - Data & Metadata 
#>   text metadata.metadata
#> 1    a                 A
#> 2    b                 B
#> 3    c                 C
#> 4    d                 D
#> 5    e                 E
#> ... ommited 6 entries (total 26)

## ------------------------------------------------
## Method `BM25$add_data`
## ------------------------------------------------

bm25 <- BM25$new()
bm25$add_data(letters, metadata = LETTERS)
bm25
#> <BM25 (k1: 1.20, b: 0.75)> with 26 documents (language: 'Detect')
#>   - Data & Metadata 
#>   text metadata.metadata
#> 1    a                 A
#> 2    b                 B
#> 3    c                 C
#> 4    d                 D
#> 5    e                 E
#> ... ommited 6 entries (total 26)

## ------------------------------------------------
## Method `BM25$query`
## ------------------------------------------------

corpus <- c(
 "The rabbit munched the orange carrot.",
 "The snake hugged the green lizard.",
 "The hedgehog impaled the orange orange.",
 "The squirrel buried the brown nut."
)
bm25 <- BM25$new(data = corpus, lang = "en",
                 metadata = data.frame(src = paste("file", 1:4)))

bm25$query("orange", max_n = 2)
#>   id     score rank                                    text    src
#> 1  3 0.4904281    1 The hedgehog impaled the orange orange. file 3
#> 2  1 0.3566750    2   The rabbit munched the orange carrot. file 1
bm25$query("orange", max_n = 3)
#>   id     score rank                                    text    src
#> 1  3 0.4904281    1 The hedgehog impaled the orange orange. file 3
#> 2  1 0.3566750    2   The rabbit munched the orange carrot. file 1
#> 3  2 0.0000000    3      The snake hugged the green lizard. file 2
bm25$query("orange", return_text = FALSE, return_metadata = FALSE)
#>   id     score rank
#> 1  3 0.4904281    1
#> 2  1 0.3566750    2
#> 3  2 0.0000000    3
#> 4  4 0.0000000    3
bm25$query("orange", max_n = 3)
#>   id     score rank                                    text    src
#> 1  3 0.4904281    1 The hedgehog impaled the orange orange. file 3
#> 2  1 0.3566750    2   The rabbit munched the orange carrot. file 1
#> 3  2 0.0000000    3      The snake hugged the green lizard. file 2