litestore @ 98fcee4252b6551c3b9fb536cdfc0a17854ac87a

A minimalist NoSQL document store.

Changed search ranking function to okapi_bm25f_kb.
h3rald <h3rald@h3rald.com>
Sun, 05 Apr 2015 19:41:05 +0200
commit  98fcee4252b6551c3b9fb536cdfc0a17854ac87a
parent  75f584a8174742f7450c0fde20896a0427e1c064

2 files changed, 56 insertions(+), 25 deletions(-)

M lib/core.nim

@@ -45,7 +45,7 @@ raise newException(EDatastoreDoesNotExist, "Datastore '$1' does not exists." % file)

   try:
     result.db = db.open(file, "", "", "")
     # Register custom function
-    discard result.db.create_function("rank", -1, SQLITE_ANY, cast[pointer](SQLITE_DETERMINISTIC), okapi_bm25, nil, nil)
+    discard result.db.create_function("rank", -1, SQLITE_ANY, cast[pointer](SQLITE_DETERMINISTIC), okapi_bm25f_kb, nil, nil)
     result.path = file
     result.mount = ""
   except:
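Side note (not part of the commit): create_function registers "rank" with nArg = -1, i.e. a variable number of arguments, which is what lets lib/utils.nim below pass extra weight parameters to the same function without re-registering it. A minimal sketch of such a variadic callback, assuming the repository's lib/sqlite3 wrapper (Pcontext, PValueArg, value_double, result_double) is imported; the proc name is illustrative only:

  proc sumArgs(pCtx: Pcontext, nVal: int32, apVal: PValueArg) {.cdecl.} =
    # Adds up however many numeric arguments the SQL query supplies,
    # mirroring the variadic signature used by okapi_bm25f_kb.
    var total = 0.0
    for i in 0..nVal-1:
      total = total + value_double(apVal[i])
    pCtx.result_double(total)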
M lib/utils.nim

@@ -24,7 +24,7 @@ result = "SELECT "

 if options.search.len > 0:
   if options.select[0] != "COUNT(id)":
     options.select.add("snippet(searchcontents) AS highlight")
-    options.select.add("rank(matchinfo(searchcontents, 'pcxnal'), 1.20, 0.75) AS rank")
+    options.select.add("rank(matchinfo(searchcontents, 'pcxnal'), 1.20, 0.75, 5.0, 0.5) AS rank")
     options.orderby = "rank DESC"
 result = result & options.select.join(", ")
 result = result & " FROM documents, searchcontents "
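The only change to the generated SQL is the two extra arguments, 5.0 and 0.5. In the new okapi_bm25f_kb below, argument col + 3 (when present) is read as the weight of FTS column col, so the first indexed column is boosted and the second dampened. A rough sketch of that argument-to-weight mapping, using plain Nim floats in place of SQLite values (names and sample values are illustrative, not taken from the repository):

  proc columnWeight(args: openArray[float], col: int): float =
    # args mirrors the SQL call above: [matchinfo, k1, b, weight0, weight1, ...]
    # (index 0 stands in for the matchinfo blob).
    result = if args.len > col + 3: args[col + 3] else: 1.0

  let args = [0.0, 1.20, 0.75, 5.0, 0.5]
  echo columnWeight(args, 0)   # 5.0 -> weight of the first FTS column
  echo columnWeight(args, 1)   # 0.5 -> weight of the second FTS column
  echo columnWeight(args, 2)   # 1.0 -> default when no weight is supplied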

@@ -119,29 +119,60 @@

 
 proc resDocumentNotFound*(id): Response =
   resError(Http404, "Document '$1' not found." % id)
 
-proc okapi_bm25*(pCtx: Pcontext, nVal: int32, apVal: PValueArg) {.cdecl.} =
-  var firstElement = value_blob(apVal[0])
-  var matchinfo = cast[ptr uarray[int32]](firstElement)
-  var searchTextCol = value_int(apVal[1])
-  var K1 = if nVal >= 3: value_double(apVal[2]) else: 1.2
-  var B = if nVal >= 4: value_double(apVal[3]) else: 0.75
-  var P_OFFSET = 0
-  var C_OFFSET = 1
-  var X_OFFSET = 2
-  var termCount = matchinfo[P_OFFSET].int32
-  var colCount = matchinfo[C_OFFSET].int32
-  var N_OFFSET = X_OFFSET + 3*termCount*colCount
-  var A_OFFSET = N_OFFSET + 1
-  var L_OFFSET = A_OFFSET + colCount
-  var totalDocs = matchinfo[N_OFFSET].float
-  var avgLength = matchinfo[A_OFFSET + searchTextCol].float
-  var docLength = matchinfo[L_OFFSET + searchTextCol].float
+# Created by Joshua Wilson on 27/05/14.
+# Copyright (c) 2014 Joshua Wilson. All rights reserved.
+# https://github.com/neozenith/sqlite-okapi-bm25
+#
+# This is an extension to the work of "Radford 'rads' Smith"
+# found at: https://github.com/rads/sqlite-okapi-bm25
+# which is covered by the MIT License
+# http://opensource.org/licenses/MIT
+# the following code shall also be covered by the same MIT License
+proc okapi_bm25f_kb*(pCtx: Pcontext, nVal: int32, apVal: PValueArg) {.cdecl.} =
+  var matchinfo = cast[ptr uarray[int32]](value_blob(apVal[0]))
+  # Setting the default values and ignoring argument based inputs so the extra
+  # arguments can be the column weights instead.
+  if nVal < 2:
+    pCtx.result_error("wrong number of arguments to function okapi_bm25_kb(), expected k1 parameter", -1)
+  if nVal < 3:
+    pCtx.result_error("wrong number of arguments to function okapi_bm25_kb(), expected b parameter", -1);
+  let K1 = value_double(apVal[1]) # 1.2
+  let B = value_double(apVal[2]) # 0.75
+  # For a good explanation of the maths and how to choose these variables
+  # http://stackoverflow.com/a/23161886/622276
+  # NOTE: the rearranged order of parameters to match the order presented on
+  # SQLite3 FTS3 documentation 'pcxnals' (http://www.sqlite.org/fts3.html#matchinfo)
+  let P_OFFSET = 0
+  let C_OFFSET = 1
+  let X_OFFSET = 2
+  let termCount = matchinfo[P_OFFSET].int32
+  let colCount = matchinfo[C_OFFSET].int32
+  let N_OFFSET = X_OFFSET + 3*termCount*colCount
+  let A_OFFSET = N_OFFSET + 1
+  let L_OFFSET = A_OFFSET + colCount
+  let totalDocs = matchinfo[N_OFFSET].float
+  var avgLength: float = 0.0
+  var docLength: float = 0.0
+  for col in 0..colCount-1:
+    avgLength = avgLength + matchinfo[A_OFFSET + col].float
+    docLength = docLength + matchinfo[L_OFFSET + col].float
+  var epsilon = 1.0 / (totalDocs*avgLength)
   var sum = 0.0;
   for i in 0..termCount-1:
-    var currentX = X_OFFSET + (3 * searchTextCol * (i + 1))
-    var termFrequency = matchinfo[currentX].float
-    var docsWithTerm = matchinfo[currentX + 2].float
-    var idf: float = ln((totalDocs - docsWithTerm + 0.5) / (docsWithTerm + 0.5))
-    var rightSide: float = (termFrequency * (K1 + 1)) / (termFrequency + (K1 * (1 - B + (B * (docLength / avgLength)))))
-    sum = sum + (idf * rightSide)
+    for col in 0..colCount-1:
+      let currentX = X_OFFSET + (3 * col * (i + 1))
+      let termFrequency = matchinfo[currentX].float
+      let docsWithTerm = matchinfo[currentX + 2].float
+      var idf: float = ln((totalDocs - docsWithTerm + 0.5) / (docsWithTerm + 0.5))
+      # "...terms appearing in more than half of the corpus will provide negative contributions to the final document score."
+      # http://en.wikipedia.org/wiki/Okapi_BM25
+      idf = if idf < 0: epsilon else: idf
+      var rightSide: float = (termFrequency * (K1 + 1)) / (termFrequency + (K1 * (1 - B + (B * (docLength / avgLength)))))
+      rightSide = rightSide + 1.0
+      # To comply with BM25+ that solves a lower bounding issue where large documents that match are unfairly scored as
+      # having similar relevancy as short documents that do not contain as many terms
+      # Yuanhua Lv and ChengXiang Zhai. 'Lower-bounding term frequency normalization.' In Proceedings of CIKM'2011, pages 7-16.
+      # http://sifaka.cs.uiuc.edu/~ylv2/pub/cikm11-lowerbound.pdf
+      let weight: float = if nVal > col+3: value_double(apVal[col+3]) else: 1.0
+      sum = sum + (idf * rightSide) * weight
   pCtx.result_double(sum)
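Side note (not part of the commit): inside the inner loop the score is standard Okapi BM25 with two visible tweaks: a negative IDF is clamped to a small epsilon, and 1.0 is added to the term-frequency factor as in BM25+. A self-contained sketch of the per-term contribution, using a fixed epsilon of 1e-6 instead of the commit's 1.0 / (totalDocs * avgLength), and made-up sample statistics:

  import math

  proc bm25PlusTerm(termFrequency, docsWithTerm, totalDocs,
                    docLength, avgLength: float,
                    k1 = 1.2, b = 0.75, epsilon = 1e-6): float =
    # Inverse document frequency, clamped so that terms occurring in more
    # than half of the corpus cannot contribute a negative score.
    var idf = ln((totalDocs - docsWithTerm + 0.5) / (docsWithTerm + 0.5))
    if idf < 0: idf = epsilon
    # Length-normalised term frequency, lower-bounded by +1.0 as in BM25+.
    let tf = (termFrequency * (k1 + 1)) /
             (termFrequency + k1 * (1 - b + b * (docLength / avgLength))) + 1.0
    result = idf * tf

  # Example: a term appearing 3 times in a 120-word column, found in 10 of
  # 1000 documents whose average column length is 100 words.
  echo bm25PlusTerm(3.0, 10.0, 1000.0, 120.0, 100.0)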