Implemented custom ranking function (Okapi BM25).
h3rald h3rald@h3rald.com
Tue, 31 Mar 2015 22:32:51 +0200
5 files changed,
52 insertions(+),
20 deletions(-)
M
lib/api_v1.nim
→
lib/api_v1.nim
@@ -150,7 +150,7 @@ let orig_limit = options.limit
let orig_offset = options.offset options.limit = 0 options.offset = 0 - options.select = "COUNT(id)" + options.select = @["COUNT(id)"] let total = LS.store.retrieveRawDocuments(options)[0].num if docs.len == 0: result = resError(Http404, "No documents found.")@@ -233,7 +233,7 @@ let jbody = body.parseJson
if jbody.kind != JArray: return resError(Http400, "Bad request: PATCH request body is not an array.") var options = newQueryOptions() - options.select = "id, content_type, binary, searchable, created, modified" + options.select = @["id", "content_type", "binary", "searchable", "created", "modified"] let doc = LS.store.retrieveRawDocument(id, options) if doc == "": return resDocumentNotFound(id)@@ -294,7 +294,7 @@ discard # never happens really.
proc head(req: Request, LS: LiteStore, resource: string, id = ""): Response = var options = newQueryOptions() - options.select = "id, content_type, binary, searchable, created, modified" + options.select = @["id", "content_type", "binary", "searchable", "created", "modified"] try: parseQueryOptions(req.url.query, options); if id != "":@@ -311,7 +311,7 @@ case resource:
of "docs": var options = newQueryOptions() if req.url.query.contains("contents=false"): - options.select = "id, content_type, binary, searchable, created, modified" + options.select = @["id", "content_type", "binary", "searchable", "created", "modified"] try: parseQueryOptions(req.url.query, options); if id != "":@@ -322,7 +322,7 @@ return LS.getDocument(id, options)
else: return LS.getRawDocuments(options) except: - return resError(Http400, "Bad request - $1" % getCurrentExceptionMsg()) + return resError(Http500, "Internal Server Error - $1" % getCurrentExceptionMsg()) of "info": if id != "": return resError(Http404, "Info '$1' not found." % id)
M
lib/core.nim
→
lib/core.nim
@@ -1,6 +1,6 @@
import sqlite3, - db_sqlite as db, + db_sqlite as db, strutils, os, oids,@@ -44,6 +44,8 @@ if not file.fileExists:
raise newException(EDatastoreDoesNotExist, "Datastore '$1' does not exists." % file) try: result.db = db.open(file, "", "", "") + # Register custom function + discard result.db.create_function("rank", -1, SQLITE_ANY, cast[pointer](SQLITE_DETERMINISTIC), okapi_bm25, nil, nil) result.path = file result.mount = "" except:@@ -103,7 +105,7 @@ var raw_document = store.db.getRow(select.sql, id)
if raw_document[0] == "": return "" else: - return $store.prepareJsonDocument(raw_document, options.select.split(", ")) + return $store.prepareJsonDocument(raw_document, options.select) proc createDocument*(store: Datastore, id="", rawdata = "", contenttype = "text/plain", binary = -1, searchable = 1): string = var id = id@@ -184,7 +186,7 @@ var select = prepareSelectDocumentsQuery(options)
var raw_documents = store.db.getAllRows(select.sql) var documents = newSeq[JsonNode](0) for doc in raw_documents: - documents.add store.prepareJsonDocument(doc, options.select.split(", ")) + documents.add store.prepareJsonDocument(doc, options.select) return %documents proc countDocuments*(store: Datastore): int64 =
M
lib/types.nim
→
lib/types.nim
@@ -9,12 +9,13 @@ EDirectoryNotFound* = object of Exception
EFileNotFound* = object of Exception EFileExists* = object of Exception EInvalidRequest* = object of Exception + uarray* {.unchecked.} [T] = array[0..0, T] Datastore* = object db*: TDbConn path*: string mount*: string QueryOptions* = object - select*: string + select*: seq[string] single*:bool limit*: int offset*: int@@ -72,4 +73,4 @@ proc ctJsonHeader*(): StringTableRef =
return CT_JSON.newStringTable proc newQueryOptions*(): QueryOptions = - return QueryOptions(select: "id, data, content_type, binary, searchable, created, modified", single: false, limit: 0, offset: 0, orderby: "", tags: "", search: "") + return QueryOptions(select: @["id", "data", "content_type", "binary", "searchable", "created", "modified"], single: false, limit: 0, offset: 0, orderby: "", tags: "", search: "")
M
lib/utils.nim
→
lib/utils.nim
@@ -1,4 +1,4 @@
-import json, db_sqlite, strutils, pegs, asyncdispatch, asynchttpserver2, times, logging +import json, db_sqlite, strutils, pegs, asyncdispatch, asynchttpserver2, times, logging, math, sqlite3 import types, queries, contenttypes proc dbQuote*(s: string): string =@@ -22,13 +22,15 @@
proc prepareSelectDocumentsQuery*(options: var QueryOptions): string = result = "SELECT " if options.search.len > 0: - if options.select != "COUNT(id)": - options.select = options.select & ", snippet(searchcontents) AS highlight " - result = result & options.select - result = result & "FROM documents, searchcontents " + if options.select[0] != "COUNT(id)": + options.select.add("snippet(searchcontents) AS highlight") + options.select.add("rank(matchinfo(searchcontents, 'pcxnal'), 1.20, 0.75, 1.5, 0.5) AS rank") + options.orderby = "rank DESC" + result = result & options.select.join(", ") + result = result & " FROM documents, searchcontents " result = result & "WHERE documents.id = searchcontents.document_id " else: - result = result & options.select + result = result & options.select.join(", ") result = result & " FROM documents WHERE 1=1 " if options.single: result = result & "AND id = ?"@@ -36,7 +38,7 @@ if options.tags.len > 0:
result = result & options.tags.selectDocumentsByTags() if options.search.len > 0: result = result & "AND searchcontents MATCH \"" & options.search & "\" " - if options.orderby.len > 0: + if options.orderby.len > 0 and options.select[0] != "COUNT(id)": result = result & "ORDER BY " & options.orderby & " " if options.limit > 0: result = result & "LIMIT " & $options.limit & " "@@ -70,7 +72,7 @@ for s in cols:
var key = s if s.contains(" "): let chunks = s.split(" ") - key = chunks[chunks.len-2] + key = chunks[chunks.len-1] res.add((key, %doc[count])) count.inc res.add(("tags", %tags))@@ -116,3 +118,30 @@ result.headers = ctJsonHeader()
proc resDocumentNotFound*(id): Response =
  ## Builds the 404 error response reported when no document matches `id`.
  resError(Http404, "Document '$1' not found." % id)

proc okapi_bm25*(pCtx: Pcontext, nVal: int32, apVal: PValueArg) {.cdecl.} =
  ## SQLite scalar-function callback that scores the current FTS row with
  ## Okapi BM25 and hands the score back through `pCtx`.
  ##
  ## Expected SQL arguments, in order:
  ## * `apVal[0]` - blob produced by `matchinfo(<table>, 'pcxnal')`
  ## * `apVal[1]` - index of the column to score (read as an integer)
  ## * `apVal[2]` - optional `K1` term-saturation parameter (default 1.2)
  ## * `apVal[3]` - optional `B` length-normalisation parameter
  ##   (default 0.75)
  ##
  ## Any further arguments are ignored. A score of 0.0 is returned when the
  ## mandatory arguments are missing, the blob is NULL, or the column index
  ## lies outside the table's columns.
  let
    P_OFFSET = 0   # 'p': number of phrases in the query
    C_OFFSET = 1   # 'c': number of user columns in the FTS table
    X_OFFSET = 2   # 'x': start of the 3-integers-per-(phrase, column) data

  # Both the matchinfo blob and the column index are mandatory.
  if nVal < 2:
    pCtx.result_double(0.0)
    return

  let firstElement = value_blob(apVal[0])
  # A NULL SQL value yields a nil blob; reading through it would crash.
  if firstElement.isNil:
    pCtx.result_double(0.0)
    return

  let matchinfo = cast[ptr uarray[int32]](firstElement)
  let searchTextCol = value_int(apVal[1]).int
  let K1 = if nVal >= 3: value_double(apVal[2]) else: 1.2
  let B = if nVal >= 4: value_double(apVal[3]) else: 0.75

  let termCount = matchinfo[P_OFFSET].int
  let colCount = matchinfo[C_OFFSET].int

  # Refuse to index outside the per-column sections of the blob.
  if searchTextCol < 0 or searchTextCol >= colCount:
    pCtx.result_double(0.0)
    return

  # Section layout following 'x': 'n' (1 value), 'a' (one value per column),
  # 'l' (one value per column).
  let N_OFFSET = X_OFFSET + 3 * termCount * colCount
  let A_OFFSET = N_OFFSET + 1
  let L_OFFSET = A_OFFSET + colCount

  let totalDocs = matchinfo[N_OFFSET].float
  let avgLength = matchinfo[A_OFFSET + searchTextCol].float
  let docLength = matchinfo[L_OFFSET + searchTextCol].float

  var score = 0.0
  for i in 0..termCount-1:
    # The 'x' data is phrase-major: phrase i, column c starts at
    # X_OFFSET + 3 * (c + i * colCount). The previous formula,
    # 3 * c * (i + 1), re-read phrase 0 for every phrase when c == 0 and
    # addressed the wrong triple for other columns.
    let currentX = X_OFFSET + 3 * (searchTextCol + i * colCount)
    let termFrequency = matchinfo[currentX].float
    let docsWithTerm = matchinfo[currentX + 2].float
    let idf: float =
      ln((totalDocs - docsWithTerm + 0.5) / (docsWithTerm + 0.5))
    let rightSide: float =
      (termFrequency * (K1 + 1)) /
      (termFrequency + (K1 * (1 - B + (B * (docLength / avgLength)))))
    score = score + (idf * rightSide)

  pCtx.result_double(score)
M
litestore.nim.cfg
→
litestore.nim.cfg
@@ -1,6 +1,6 @@
-define: "release" +#define:release dynlibOverride:sqlite3 -dynlibOverride:fts4rank +threads:off # http://crossgcc.rts-software.org/doku.php?id=compiling_for_win32 i386.windows.gcc.path = "/usr/local/gcc-4.8.0-qt-4.8.4-for-mingw32/win32-gcc/bin"