Skip to content

Commit 2a7d4b9

Browse files
committed
fixed the search and postings list builder #14 and #11
1 parent 1345e59 commit 2a7d4b9

File tree

4 files changed

+59
-38
lines changed

4 files changed

+59
-38
lines changed

src/data/components/input/fileInput.js

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,6 @@ export const DataIngestor = (props) => {
6363
content: x[textField],
6464
id:uuid()
6565
}))
66-
debugger;
6766
db.addDocsBatch(formtedData)
6867
.then(()=>setLoading(false))
6968
;

src/data/db/dexiewDB.js

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,6 @@ export const search = async (query, params) => {
9292
candidateDocIds = await searchForTrigrams(terms);
9393
}
9494
let result
95-
debugger;
9695
switch (params.labelFilter) {
9796
case LABEL_FILTER_OPTIONS.ALL:
9897
result = dataTable.where("id").anyOf(candidateDocIds)
@@ -103,7 +102,6 @@ export const search = async (query, params) => {
103102
case LABEL_FILTER_OPTIONS.UNLABELED:
104103

105104
const idsToExclude = new Set(await dataTable.where('[has_label+id]').anyOf(candidateDocIds.map(id => [1, id])).primaryKeys())
106-
debugger;
107105
const idsToGet = candidateDocIds.filter(x => !idsToExclude.has(x))
108106
result = dataTable.where("id").anyOf(idsToGet)
109107
break

src/data/db/insertionUtils.js

Lines changed: 48 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,6 @@ export const enqueTermsToBePosted = async (docs) => {
9898

9999
}
100100
export const addDocsToStore = async (docs) => {
101-
debugger;
102101
let t1 = new Date()
103102
await dataTable.bulkAdd(docs);
104103
let t2 = new Date()
@@ -116,54 +115,72 @@ export const addDocsToStore = async (docs) => {
116115
}
117116

118117
export const moveNewTermsFromPostingsQueueToPostingsTable = async (db) => {
119-
const terms = await postingsQueueTable.where("newTerm").equals(1).toArray()
120118
const step = 1000;
121-
for (let i = 0; i < terms.length; i += step) {
119+
let collection = postingsQueueTable.where("newTerm").equals(1).limit(step)
120+
let terms = await collection.toArray()
121+
debugger;
122+
123+
while (terms.length > 0) {
122124
await db.transaction('rw', [DF_SCHEMA, POSTING_QUEUE_SCHEMA, POSTINGS_SCHEMA], async tx => {
123-
const currentSet = terms.slice(i,i+step);
124-
const insertResults = await postingsTable.bulkAdd(currentSet)
125-
await dfTable.bulkAdd(currentSet.map(x=>({trigram:x.trigram,freq: x.docs.length})))
126-
await postingsQueueTable.where("trigram").anyOf(terms.map(x => x.trigram)).delete()
127-
125+
let t1 = new Date()
126+
const insertResults = await postingsTable.bulkAdd(terms)
127+
await dfTable.bulkAdd(terms.map(x => ({ trigram: x.trigram, freq: x.docs.length })))
128+
await collection.delete()
129+
collection = postingsQueueTable.where("newTerm").equals(1).limit(step)
130+
terms = await collection.toArray()
131+
let t2 = new Date()
132+
console.log(`moved ${terms.length} items from postings que`)
133+
128134
})
129135
}
130136
}
137+
const getTermsMap = (terms) => terms.reduce((map, termObj) => {
138+
map[termObj.trigram] = termObj
139+
return map
140+
}, {})
131141

142+
const appendIngLogic = async (db)=>{
143+
144+
}
132145
export const appendDocsFromQueueToExistingPostingsItems = async (db) => {
133146
//Get the terms that need to be updated
134-
const terms = await postingsQueueTable.where("newTerm").equals(0).toArray()
147+
const step = 400; //This is low because larger values give an IPC error
148+
let collection = postingsQueueTable.where("newTerm").equals(0).limit(step);
149+
150+
let terms = await collection.toArray()
151+
debugger;
135152
console.log(`Got ${terms.length} terms to update in postings table`)
136-
const termsMap = terms.reduce((map, termObj) => {
137-
map[termObj.trigram] = termObj
138-
return map
139-
}, {})
153+
let termsMap = getTermsMap(terms)
154+
155+
let termKeys = Object.keys(termsMap);
140156

141-
const termKeys = Object.keys(termsMap);
142-
const step = 1000;
143157
console.log(`Term map built`)
144-
145-
for (let i=0; i<termKeys.length;i+=step){
158+
do {
146159
let t1 = new Date()
147-
await db.transaction('rw', [DF_SCHEMA, POSTING_QUEUE_SCHEMA, POSTINGS_SCHEMA], async tx => {
148-
149-
const currentSet = termKeys.slice(i,i+step);
150-
151-
await postingsTable.where("trigram").anyOf(currentSet).modify(posting => {
160+
await db.transaction('rw', [DF_SCHEMA, POSTING_QUEUE_SCHEMA, POSTINGS_SCHEMA], async tx => {
161+
let collection = postingsQueueTable.where("newTerm").equals(0).limit(step);
162+
let terms = await collection.toArray()
163+
if (terms.length ===0){
164+
}
165+
else{
166+
167+
// Linter warns us about something potentially quite bad https://eslint.org/docs/rules/no-loop-func
168+
await postingsTable.where("trigram").anyOf(termKeys).modify(posting => {
152169
const newDocIds = termsMap[posting.trigram].docs
153170
posting.docs = posting.docs.concat(newDocIds);
154171
})
155-
await dfTable.where("trigram").anyOf(currentSet).modify(df =>{
156-
157-
df.freq = df.freq+ termsMap[df.trigram].docs.length
172+
await dfTable.where("trigram").anyOf(termKeys).modify(df => {
173+
174+
df.freq = df.freq + termsMap[df.trigram].docs.length
158175
})
159176

160-
await postingsQueueTable.where("trigram").anyOf(currentSet).delete()
177+
await collection.delete()
178+
161179

180+
}
162181
})
163182
let t2 = new Date();
164-
console.log(`Updated ${step} terms in postings table in ${t2-t1} ms`)
165-
}
166-
167-
console.log(`Postings table updated`)
168-
await postingsQueueTable.where("newTerm").equals(0).delete()
183+
console.log(`Updated ${step} terms in postings table in ${t2 - t1} ms`)
184+
} while( (await postingsQueueTable.where("newTerm").equals(0).count() ) >0)
185+
169186
}

src/data/db/queryFunctions.js

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@ export const searchForTrigram = async (trigram,docIds=[])=>{
1414

1515
//If only a trigram was provided
1616
keys = await postingsTable.get(trigram)
17-
debugger;
1817
return keys.docs;
1918

2019
//Keys is an array of primary keys of the posting table. The PK is [term,docId]
@@ -52,21 +51,29 @@ export const searchForTrigrams = async (trigrams,docIds=[]) =>{
5251

5352
//We just started, sort the trigrams by df so that we query by least frequent.
5453
const previousTrigramCount = trigrams.length
55-
debugger;
5654
trigrams = await sortTrigramsByDF(trigrams)
5755
trigrams = trigrams.map(x=>x.trigram);
5856
if (trigrams.length < previousTrigramCount || trigrams.length ===0){
5957
// In this case, one or more of the trigrams was not in the index, so return [] without searching
6058
return []
6159

6260
}
61+
let first = true;
6362
do {
6463
// Keep narrowing down the list until we are out of trigrams or the list of document ids is empty (which means there is no match)
6564
const trigram = trigrams.shift();
66-
docIds = await searchForTrigram(trigram,docIds)
65+
const foundIds = await searchForTrigram(trigram,docIds)
66+
if (first){
67+
docIds = foundIds.map(x=>x.id)
68+
}else{
69+
const foundIdsSet = new Set(foundIds.map(x=>x.id));
70+
docIds = docIds.filter(id=>foundIdsSet.has(id));
71+
}
72+
first=false;
73+
6774
} while (docIds.length>0 && trigrams.length >0)
6875

69-
return docIds.map(x=>x.id);
76+
return docIds;
7077

7178

7279
}

0 commit comments

Comments
 (0)