Class to construct the BM25 search object
Methods
Method new()
Creates a new instance of a BM25 class
Usage
BM25$new(data = NULL, lang = "detect", k1 = 1.2, b = 0.75, metadata = NULL)
Arguments
data
text data, a vector of strings. Note that any preprocessing steps (tolower, removing stopwords, etc.) must already have been applied to the data before it is passed in!
lang
language of the data; see self$available_languages() for the supported values. Can also be "detect" to automatically detect the language. Default is "detect".
k1
k1 parameter of BM25, default is 1.2
b
b parameter of BM25, default is 0.75
metadata
a data.frame with metadata for each document, default is NULL. If provided, it must be a data.frame with the same number of rows as there are documents, containing arbitrary metadata for each document, e.g. a file path or a URL.
Examples
corpus <- c(
"The rabbit munched the orange carrot.",
"The snake hugged the green lizard.",
"The hedgehog impaled the orange orange.",
"The squirrel buried the brown nut."
)
bm25 <- BM25$new(data = corpus, lang = "en",
metadata = data.frame(src = paste("file", 1:4)))
bm25
bm25$get_data()
bm25$query("orange", max_n = 2)
bm25$query("orange", max_n = 3)
bm25$query("orange") # return all, same as max_n = Inf or NULL
Method available_languages()
Returns the available languages
Examples
BM25$new()$available_languages()
Method print()
Prints a BM25 object
Arguments
n
number of documents (rows) to print, default is 5
nchar
number of characters to print for each text, default is 20
Examples
BM25$new(data = letters, metadata = LETTERS)
Method add_data()
Adds data to the BM25 object
This can be useful to add more data later on. Note that this will rebuild the engine.
Arguments
data
a vector of strings
metadata
a data.frame with metadata for each document, default is NULL
Examples
bm25 <- BM25$new()
bm25$add_data(letters, metadata = LETTERS)
bm25
Method query()
Query the BM25 object for the N best matches
Arguments
query
the term to search for. Note that all preprocessing that was initially applied to the text corpus (e.g., tolower, removing stopwords, etc.) must already have been applied to the term as well.
max_n
the maximum number of results to return, default is to return all results (same as max_n = Inf or NULL)
return_text
whether to return the text, default is TRUE
return_metadata
whether to return metadata, default is TRUE
Examples
corpus <- c(
"The rabbit munched the orange carrot.",
"The snake hugged the green lizard.",
"The hedgehog impaled the orange orange.",
"The squirrel buried the brown nut."
)
bm25 <- BM25$new(data = corpus, lang = "en",
metadata = data.frame(src = paste("file", 1:4)))
bm25$query("orange", max_n = 2)
bm25$query("orange", max_n = 3)
bm25$query("orange", return_text = FALSE, return_metadata = FALSE)
bm25$query("orange", max_n = 3)
Examples
corpus <- c(
"The rabbit munched the orange carrot.",
"The snake hugged the green lizard.",
"The hedgehog impaled the orange orange.",
"The squirrel buried the brown nut."
)
bm25 <- BM25$new(data = corpus, lang = "en",
metadata = data.frame(src = paste("file", 1:4)))
bm25$query("orange", max_n = 2)
#> id score rank text src
#> 1 3 0.4904281 1 The hedgehog impaled the orange orange. file 3
#> 2 1 0.3566750 2 The rabbit munched the orange carrot. file 1
bm25$query("orange")
#> id score rank text src
#> 1 3 0.4904281 1 The hedgehog impaled the orange orange. file 3
#> 2 1 0.3566750 2 The rabbit munched the orange carrot. file 1
#> 3 2 0.0000000 3 The snake hugged the green lizard. file 2
#> 4 4 0.0000000 3 The squirrel buried the brown nut. file 4
## ------------------------------------------------
## Method `BM25$new`
## ------------------------------------------------
corpus <- c(
"The rabbit munched the orange carrot.",
"The snake hugged the green lizard.",
"The hedgehog impaled the orange orange.",
"The squirrel buried the brown nut."
)
bm25 <- BM25$new(data = corpus, lang = "en",
metadata = data.frame(src = paste("file", 1:4)))
bm25
#> <BM25 (k1: 1.20, b: 0.75)> with 4 documents (language: 'English')
#> - Data & Metadata
#> text metadata.src
#> 1 The rabbit munched the orange carrot. file 1
#> 2 The snake hugged the green lizard. file 2
#> 3 The hedgehog impaled the orange orange. file 3
#> 4 The squirrel buried the brown nut. file 4
bm25$get_data()
#> text src
#> 1 The rabbit munched the orange carrot. file 1
#> 2 The snake hugged the green lizard. file 2
#> 3 The hedgehog impaled the orange orange. file 3
#> 4 The squirrel buried the brown nut. file 4
bm25$query("orange", max_n = 2)
#> id score rank text src
#> 1 3 0.4904281 1 The hedgehog impaled the orange orange. file 3
#> 2 1 0.3566750 2 The rabbit munched the orange carrot. file 1
bm25$query("orange", max_n = 3)
#> id score rank text src
#> 1 3 0.4904281 1 The hedgehog impaled the orange orange. file 3
#> 2 1 0.3566750 2 The rabbit munched the orange carrot. file 1
#> 3 2 0.0000000 3 The snake hugged the green lizard. file 2
bm25$query("orange") # return all, same as max_n = Inf or NULL
#> id score rank text src
#> 1 3 0.4904281 1 The hedgehog impaled the orange orange. file 3
#> 2 1 0.3566750 2 The rabbit munched the orange carrot. file 1
#> 3 2 0.0000000 3 The snake hugged the green lizard. file 2
#> 4 4 0.0000000 3 The squirrel buried the brown nut. file 4
## ------------------------------------------------
## Method `BM25$available_languages`
## ------------------------------------------------
BM25$new()$available_languages()
#> ar da nl en fr de
#> "arabic" "danish" "dutch" "english" "french" "german"
#> el hu it no pt ro
#> "greek" "hungarian" "italian" "norwegian" "portuguese" "romanian"
#> ru es sv ta tr auto
#> "russian" "spanish" "swedish" "tamil" "turkish" "detect"
## ------------------------------------------------
## Method `BM25$get_data`
## ------------------------------------------------
BM25$new(data = letters, metadata = LETTERS)$get_data()
#> text metadata
#> 1 a A
#> 2 b B
#> 3 c C
#> 4 d D
#> 5 e E
#> 6 f F
#> 7 g G
#> 8 h H
#> 9 i I
#> 10 j J
#> 11 k K
#> 12 l L
#> 13 m M
#> 14 n N
#> 15 o O
#> 16 p P
#> 17 q Q
#> 18 r R
#> 19 s S
#> 20 t T
#> 21 u U
#> 22 v V
#> 23 w W
#> 24 x X
#> 25 y Y
#> 26 z Z
## ------------------------------------------------
## Method `BM25$get_lang`
## ------------------------------------------------
BM25$new()$get_lang()
#> [1] "Detect"
BM25$new(lang = "en")$get_lang()
#> [1] "English"
BM25$new(lang = "detect")$get_lang()
#> [1] "Detect"
## ------------------------------------------------
## Method `BM25$print`
## ------------------------------------------------
BM25$new(data = letters, metadata = LETTERS)
#> <BM25 (k1: 1.20, b: 0.75)> with 26 documents (language: 'Detect')
#> - Data & Metadata
#> text metadata.metadata
#> 1 a A
#> 2 b B
#> 3 c C
#> 4 d D
#> 5 e E
#> ... ommited 6 entries (total 26)
## ------------------------------------------------
## Method `BM25$add_data`
## ------------------------------------------------
bm25 <- BM25$new()
bm25$add_data(letters, metadata = LETTERS)
bm25
#> <BM25 (k1: 1.20, b: 0.75)> with 26 documents (language: 'Detect')
#> - Data & Metadata
#> text metadata.metadata
#> 1 a A
#> 2 b B
#> 3 c C
#> 4 d D
#> 5 e E
#> ... ommited 6 entries (total 26)
## ------------------------------------------------
## Method `BM25$query`
## ------------------------------------------------
corpus <- c(
"The rabbit munched the orange carrot.",
"The snake hugged the green lizard.",
"The hedgehog impaled the orange orange.",
"The squirrel buried the brown nut."
)
bm25 <- BM25$new(data = corpus, lang = "en",
metadata = data.frame(src = paste("file", 1:4)))
bm25$query("orange", max_n = 2)
#> id score rank text src
#> 1 3 0.4904281 1 The hedgehog impaled the orange orange. file 3
#> 2 1 0.3566750 2 The rabbit munched the orange carrot. file 1
bm25$query("orange", max_n = 3)
#> id score rank text src
#> 1 3 0.4904281 1 The hedgehog impaled the orange orange. file 3
#> 2 1 0.3566750 2 The rabbit munched the orange carrot. file 1
#> 3 2 0.0000000 3 The snake hugged the green lizard. file 2
bm25$query("orange", return_text = FALSE, return_metadata = FALSE)
#> id score rank
#> 1 3 0.4904281 1
#> 2 1 0.3566750 2
#> 3 2 0.0000000 3
#> 4 4 0.0000000 3
bm25$query("orange", max_n = 3)
#> id score rank text src
#> 1 3 0.4904281 1 The hedgehog impaled the orange orange. file 3
#> 2 1 0.3566750 2 The rabbit munched the orange carrot. file 1
#> 3 2 0.0000000 3 The snake hugged the green lizard. file 2