litestore @ 98fcee4252b6551c3b9fb536cdfc0a17854ac87a

A minimalist NoSQL document store.

Changed search ranking function to okapi_bm25f_kb.
h3rald <h3rald@h3rald.com>
Sun, 05 Apr 2015 19:41:05 +0200
commit  98fcee4252b6551c3b9fb536cdfc0a17854ac87a
parent  75f584a8174742f7450c0fde20896a0427e1c064

2 files changed, 56 insertions(+), 25 deletions(-)

M lib/core.nim

@@ -45,7 +45,7 @@ raise newException(EDatastoreDoesNotExist, "Datastore '$1' does not exists." % file)

   try:
     result.db = db.open(file, "", "", "")
     # Register custom function
-    discard result.db.create_function("rank", -1, SQLITE_ANY, cast[pointer](SQLITE_DETERMINISTIC), okapi_bm25, nil, nil)
+    discard result.db.create_function("rank", -1, SQLITE_ANY, cast[pointer](SQLITE_DETERMINISTIC), okapi_bm25f_kb, nil, nil)
     result.path = file
     result.mount = ""
   except:
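Side note (not part of the commit): create_function registers "rank" with nArg = -1, i.e. a variable number of arguments, which is what lets lib/utils.nim below pass extra weight parameters to the same function without re-registering it. A minimal sketch of such a variadic callback, assuming the repository's lib/sqlite3 wrapper (Pcontext, PValueArg, value_double, result_double) is imported; the proc name is illustrative only:

  proc sumArgs(pCtx: Pcontext, nVal: int32, apVal: PValueArg) {.cdecl.} =
    # Adds up however many numeric arguments the SQL query supplies,
    # mirroring the variadic signature used by okapi_bm25f_kb.
    var total = 0.0
    for i in 0..nVal-1:
      total = total + value_double(apVal[i])
    pCtx.result_double(total)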
M lib/utils.nim

@@ -24,7 +24,7 @@ result = "SELECT "

 if options.search.len > 0:
   if options.select[0] != "COUNT(id)":
     options.select.add("snippet(searchcontents) AS highlight")
-    options.select.add("rank(matchinfo(searchcontents, 'pcxnal'), 1.20, 0.75) AS rank")
+    options.select.add("rank(matchinfo(searchcontents, 'pcxnal'), 1.20, 0.75, 5.0, 0.5) AS rank")
     options.orderby = "rank DESC"
 result = result & options.select.join(", ")
 result = result & " FROM documents, searchcontents "
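The only change to the generated SQL is the two extra arguments, 5.0 and 0.5. In the new okapi_bm25f_kb below, argument col + 3 (when present) is read as the weight of FTS column col, so the first indexed column is boosted and the second dampened. A rough sketch of that argument-to-weight mapping, using plain Nim floats in place of SQLite values (names and sample values are illustrative, not taken from the repository):

  proc columnWeight(args: openArray[float], col: int): float =
    # args mirrors the SQL call above: [matchinfo, k1, b, weight0, weight1, ...]
    # (index 0 stands in for the matchinfo blob).
    result = if args.len > col + 3: args[col + 3] else: 1.0

  let args = [0.0, 1.20, 0.75, 5.0, 0.5]
  echo columnWeight(args, 0)   # 5.0 -> weight of the first FTS column
  echo columnWeight(args, 1)   # 0.5 -> weight of the second FTS column
  echo columnWeight(args, 2)   # 1.0 -> default when no weight is supplied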

@@ -119,29 +119,60 @@

 
 proc resDocumentNotFound*(id): Response =
   resError(Http404, "Document '$1' not found." % id)
 
-proc okapi_bm25*(pCtx: Pcontext, nVal: int32, apVal: PValueArg) {.cdecl.} =
-  var firstElement = value_blob(apVal[0])
-  var matchinfo = cast[ptr uarray[int32]](firstElement)
-  var searchTextCol = value_int(apVal[1])
-  var K1 = if nVal >= 3: value_double(apVal[2]) else: 1.2
-  var B = if nVal >= 4: value_double(apVal[3]) else: 0.75
-  var P_OFFSET = 0
-  var C_OFFSET = 1
-  var X_OFFSET = 2
-  var termCount = matchinfo[P_OFFSET].int32
-  var colCount = matchinfo[C_OFFSET].int32
-  var N_OFFSET = X_OFFSET + 3*termCount*colCount
-  var A_OFFSET = N_OFFSET + 1
-  var L_OFFSET = A_OFFSET + colCount
-  var totalDocs = matchinfo[N_OFFSET].float
-  var avgLength = matchinfo[A_OFFSET + searchTextCol].float
-  var docLength = matchinfo[L_OFFSET + searchTextCol].float
+# Created by Joshua Wilson on 27/05/14.
+# Copyright (c) 2014 Joshua Wilson. All rights reserved.
+# https://github.com/neozenith/sqlite-okapi-bm25
+#
+# This is an extension to the work of "Radford 'rads' Smith"
+# found at: https://github.com/rads/sqlite-okapi-bm25
+# which is covered by the MIT License
+# http://opensource.org/licenses/MIT
+# the following code shall also be covered by the same MIT License
+proc okapi_bm25f_kb*(pCtx: Pcontext, nVal: int32, apVal: PValueArg) {.cdecl.} =
+  var matchinfo = cast[ptr uarray[int32]](value_blob(apVal[0]))
+  # Setting the default values and ignoring argument based inputs so the extra
+  # arguments can be the column weights instead.
+  if nVal < 2:
+    pCtx.result_error("wrong number of arguments to function okapi_bm25_kb(), expected k1 parameter", -1)
+  if nVal < 3:
+    pCtx.result_error("wrong number of arguments to function okapi_bm25_kb(), expected b parameter", -1);
+  let K1 = value_double(apVal[1]) # 1.2
+  let B = value_double(apVal[2]) # 0.75
+  # For a good explanation of the maths and how to choose these variables
+  # http://stackoverflow.com/a/23161886/622276
+  # NOTE: the rearranged order of parameters to match the order presented on
+  # SQLite3 FTS3 documentation 'pcxnals' (http://www.sqlite.org/fts3.html#matchinfo)
+  let P_OFFSET = 0
+  let C_OFFSET = 1
+  let X_OFFSET = 2
+  let termCount = matchinfo[P_OFFSET].int32
+  let colCount = matchinfo[C_OFFSET].int32
+  let N_OFFSET = X_OFFSET + 3*termCount*colCount
+  let A_OFFSET = N_OFFSET + 1
+  let L_OFFSET = A_OFFSET + colCount
+  let totalDocs = matchinfo[N_OFFSET].float
+  var avgLength: float = 0.0
+  var docLength: float = 0.0
+  for col in 0..colCount-1:
+    avgLength = avgLength + matchinfo[A_OFFSET + col].float
+    docLength = docLength + matchinfo[L_OFFSET + col].float
+  var epsilon = 1.0 / (totalDocs*avgLength)
   var sum = 0.0;
   for i in 0..termCount-1:
-    var currentX = X_OFFSET + (3 * searchTextCol * (i + 1))
-    var termFrequency = matchinfo[currentX].float
-    var docsWithTerm = matchinfo[currentX + 2].float
-    var idf: float = ln((totalDocs - docsWithTerm + 0.5) / (docsWithTerm + 0.5))
-    var rightSide: float = (termFrequency * (K1 + 1)) / (termFrequency + (K1 * (1 - B + (B * (docLength / avgLength)))))
-    sum = sum + (idf * rightSide)
+    for col in 0..colCount-1:
+      let currentX = X_OFFSET + (3 * col * (i + 1))
+      let termFrequency = matchinfo[currentX].float
+      let docsWithTerm = matchinfo[currentX + 2].float
+      var idf: float = ln((totalDocs - docsWithTerm + 0.5) / (docsWithTerm + 0.5))
+      # "...terms appearing in more than half of the corpus will provide negative contributions to the final document score."
+      # http://en.wikipedia.org/wiki/Okapi_BM25
+      idf = if idf < 0: epsilon else: idf
+      var rightSide: float = (termFrequency * (K1 + 1)) / (termFrequency + (K1 * (1 - B + (B * (docLength / avgLength)))))
+      rightSide = rightSide + 1.0
+      # To comply with BM25+ that solves a lower bounding issue where large documents that match are unfairly scored as
+      # having similar relevancy as short documents that do not contain as many terms
+      # Yuanhua Lv and ChengXiang Zhai. 'Lower-bounding term frequency normalization.' In Proceedings of CIKM'2011, pages 7-16.
+      # http://sifaka.cs.uiuc.edu/~ylv2/pub/cikm11-lowerbound.pdf
+      let weight: float = if nVal > col+3: value_double(apVal[col+3]) else: 1.0
+      sum = sum + (idf * rightSide) * weight
   pCtx.result_double(sum)
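Side note (not part of the commit): inside the inner loop the score is standard Okapi BM25 with two visible tweaks: a negative IDF is clamped to a small epsilon, and 1.0 is added to the term-frequency factor as in BM25+. A self-contained sketch of the per-term contribution, using a fixed epsilon of 1e-6 instead of the commit's 1.0 / (totalDocs * avgLength), and made-up sample statistics:

  import math

  proc bm25PlusTerm(termFrequency, docsWithTerm, totalDocs,
                    docLength, avgLength: float,
                    k1 = 1.2, b = 0.75, epsilon = 1e-6): float =
    # Inverse document frequency, clamped so that terms occurring in more
    # than half of the corpus cannot contribute a negative score.
    var idf = ln((totalDocs - docsWithTerm + 0.5) / (docsWithTerm + 0.5))
    if idf < 0: idf = epsilon
    # Length-normalised term frequency, lower-bounded by +1.0 as in BM25+.
    let tf = (termFrequency * (k1 + 1)) /
             (termFrequency + k1 * (1 - b + b * (docLength / avgLength))) + 1.0
    result = idf * tf

  # Example: a term appearing 3 times in a 120-word column, found in 10 of
  # 1000 documents whose average column length is 100 words.
  echo bm25PlusTerm(3.0, 10.0, 1000.0, 120.0, 100.0)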