Class to construct the BM25 search object
Methods
Method new()
Creates a new instance of a BM25 class
Usage
BM25$new(data = NULL, lang = "detect", k1 = 1.2, b = 0.75, metadata = NULL)
Arguments
data: text data, a vector of strings. Note that any preprocessing steps (tolower, removing stopwords, etc.) need to have taken place before this!
lang: language of the data, see self$available_languages(); can also be "detect" to automatically detect the language. Default is "detect".
k1: k1 parameter of BM25, default is 1.2
b: b parameter of BM25, default is 0.75
metadata: a data.frame with metadata for each document, default is NULL. If provided, it must be a data.frame with the same number of rows as there are documents, containing arbitrary metadata for each document, e.g. a file path or a URL.
Examples
corpus <- c(
"The rabbit munched the orange carrot.",
"The snake hugged the green lizard.",
"The hedgehog impaled the orange orange.",
"The squirrel buried the brown nut."
)
bm25 <- BM25$new(data = corpus, lang = "en",
metadata = data.frame(src = paste("file", 1:4)))
bm25
bm25$get_data()
bm25$query("orange", max_n = 2)
bm25$query("orange", max_n = 3)
bm25$query("orange") # return all, same as max_n = Inf or NULL
Method available_languages()
Returns the available languages
Examples
BM25$new()$available_languages()
Method print()
Prints a BM25 object
Arguments
n: number of data to print, default is 5
nchar: number of characters to print for each text, default is 20
Examples
BM25$new(data = letters, metadata = LETTERS)
Method add_data()
Adds data to the BM25 object
This can be useful to add more data later on. Note that this will rebuild the engine.
Arguments
data: a vector of strings
metadata: a data.frame with metadata for each document, default is NULL
Examples
bm25 <- BM25$new()
bm25$add_data(letters, metadata = LETTERS)
bm25
Method query()
Query the BM25 object for the N best matches
Arguments
query: the term to search for. Note that all preprocessing that was applied to the text corpus initially needs to be already performed on the term, e.g., tolower, removing stopwords, etc.
max_n: the maximum number of results to return, default is all
return_text: whether to return the text, default is TRUE
return_metadata: whether to return metadata, default is TRUE
Examples
corpus <- c(
"The rabbit munched the orange carrot.",
"The snake hugged the green lizard.",
"The hedgehog impaled the orange orange.",
"The squirrel buried the brown nut."
)
bm25 <- BM25$new(data = corpus, lang = "en",
metadata = data.frame(src = paste("file", 1:4)))
bm25$query("orange", max_n = 2)
bm25$query("orange", max_n = 3)
bm25$query("orange", return_text = FALSE, return_metadata = FALSE)
bm25$query("orange", max_n = 3)
Examples
corpus <- c(
"The rabbit munched the orange carrot.",
"The snake hugged the green lizard.",
"The hedgehog impaled the orange orange.",
"The squirrel buried the brown nut."
)
bm25 <- BM25$new(data = corpus, lang = "en",
metadata = data.frame(src = paste("file", 1:4)))
bm25$query("orange", max_n = 2)
#> id score rank text src
#> 1 3 0.4904281 1 The hedgehog impaled the orange orange. file 3
#> 2 1 0.3566750 2 The rabbit munched the orange carrot. file 1
bm25$query("orange")
#> id score rank text src
#> 1 3 0.4904281 1 The hedgehog impaled the orange orange. file 3
#> 2 1 0.3566750 2 The rabbit munched the orange carrot. file 1
#> 3 2 0.0000000 3 The snake hugged the green lizard. file 2
#> 4 4 0.0000000 3 The squirrel buried the brown nut. file 4
## ------------------------------------------------
## Method `BM25$new`
## ------------------------------------------------
corpus <- c(
"The rabbit munched the orange carrot.",
"The snake hugged the green lizard.",
"The hedgehog impaled the orange orange.",
"The squirrel buried the brown nut."
)
bm25 <- BM25$new(data = corpus, lang = "en",
metadata = data.frame(src = paste("file", 1:4)))
bm25
#> <BM25 (k1: 1.20, b: 0.75)> with 4 documents (language: 'English')
#> - Data & Metadata
#> text metadata.src
#> 1 The rabbit munched the orange carrot. file 1
#> 2 The snake hugged the green lizard. file 2
#> 3 The hedgehog impaled the orange orange. file 3
#> 4 The squirrel buried the brown nut. file 4
bm25$get_data()
#> text src
#> 1 The rabbit munched the orange carrot. file 1
#> 2 The snake hugged the green lizard. file 2
#> 3 The hedgehog impaled the orange orange. file 3
#> 4 The squirrel buried the brown nut. file 4
bm25$query("orange", max_n = 2)
#> id score rank text src
#> 1 3 0.4904281 1 The hedgehog impaled the orange orange. file 3
#> 2 1 0.3566750 2 The rabbit munched the orange carrot. file 1
bm25$query("orange", max_n = 3)
#> id score rank text src
#> 1 3 0.4904281 1 The hedgehog impaled the orange orange. file 3
#> 2 1 0.3566750 2 The rabbit munched the orange carrot. file 1
#> 3 2 0.0000000 3 The snake hugged the green lizard. file 2
bm25$query("orange") # return all, same as max_n = Inf or NULL
#> id score rank text src
#> 1 3 0.4904281 1 The hedgehog impaled the orange orange. file 3
#> 2 1 0.3566750 2 The rabbit munched the orange carrot. file 1
#> 3 2 0.0000000 3 The snake hugged the green lizard. file 2
#> 4 4 0.0000000 3 The squirrel buried the brown nut. file 4
## ------------------------------------------------
## Method `BM25$available_languages`
## ------------------------------------------------
BM25$new()$available_languages()
#> ar da nl en fr de
#> "arabic" "danish" "dutch" "english" "french" "german"
#> el hu it no pt ro
#> "greek" "hungarian" "italian" "norwegian" "portuguese" "romanian"
#> ru es sv ta tr auto
#> "russian" "spanish" "swedish" "tamil" "turkish" "detect"
## ------------------------------------------------
## Method `BM25$get_data`
## ------------------------------------------------
BM25$new(data = letters, metadata = LETTERS)$get_data()
#> text metadata
#> 1 a A
#> 2 b B
#> 3 c C
#> 4 d D
#> 5 e E
#> 6 f F
#> 7 g G
#> 8 h H
#> 9 i I
#> 10 j J
#> 11 k K
#> 12 l L
#> 13 m M
#> 14 n N
#> 15 o O
#> 16 p P
#> 17 q Q
#> 18 r R
#> 19 s S
#> 20 t T
#> 21 u U
#> 22 v V
#> 23 w W
#> 24 x X
#> 25 y Y
#> 26 z Z
## ------------------------------------------------
## Method `BM25$get_lang`
## ------------------------------------------------
BM25$new()$get_lang()
#> [1] "Detect"
BM25$new(lang = "en")$get_lang()
#> [1] "English"
BM25$new(lang = "detect")$get_lang()
#> [1] "Detect"
## ------------------------------------------------
## Method `BM25$print`
## ------------------------------------------------
BM25$new(data = letters, metadata = LETTERS)
#> <BM25 (k1: 1.20, b: 0.75)> with 26 documents (language: 'Detect')
#> - Data & Metadata
#> text metadata.metadata
#> 1 a A
#> 2 b B
#> 3 c C
#> 4 d D
#> 5 e E
#> ... ommited 6 entries (total 26)
## ------------------------------------------------
## Method `BM25$add_data`
## ------------------------------------------------
bm25 <- BM25$new()
bm25$add_data(letters, metadata = LETTERS)
bm25
#> <BM25 (k1: 1.20, b: 0.75)> with 26 documents (language: 'Detect')
#> - Data & Metadata
#> text metadata.metadata
#> 1 a A
#> 2 b B
#> 3 c C
#> 4 d D
#> 5 e E
#> ... ommited 6 entries (total 26)
## ------------------------------------------------
## Method `BM25$query`
## ------------------------------------------------
corpus <- c(
"The rabbit munched the orange carrot.",
"The snake hugged the green lizard.",
"The hedgehog impaled the orange orange.",
"The squirrel buried the brown nut."
)
bm25 <- BM25$new(data = corpus, lang = "en",
metadata = data.frame(src = paste("file", 1:4)))
bm25$query("orange", max_n = 2)
#> id score rank text src
#> 1 3 0.4904281 1 The hedgehog impaled the orange orange. file 3
#> 2 1 0.3566750 2 The rabbit munched the orange carrot. file 1
bm25$query("orange", max_n = 3)
#> id score rank text src
#> 1 3 0.4904281 1 The hedgehog impaled the orange orange. file 3
#> 2 1 0.3566750 2 The rabbit munched the orange carrot. file 1
#> 3 2 0.0000000 3 The snake hugged the green lizard. file 2
bm25$query("orange", return_text = FALSE, return_metadata = FALSE)
#> id score rank
#> 1 3 0.4904281 1
#> 2 1 0.3566750 2
#> 3 2 0.0000000 3
#> 4 4 0.0000000 3
bm25$query("orange", max_n = 3)
#> id score rank text src
#> 1 3 0.4904281 1 The hedgehog impaled the orange orange. file 3
#> 2 1 0.3566750 2 The rabbit munched the orange carrot. file 1
#> 3 2 0.0000000 3 The snake hugged the green lizard. file 2