-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathload.sql
More file actions
82 lines (69 loc) · 2.65 KB
/
Copy pathload.sql
File metadata and controls
82 lines (69 loc) · 2.65 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
-- Load the fetched dataset into SereneDB and build index-covering inverted
-- indexes for the code-search app.
--
-- Paths are passed in by setup.sh as psql variables (the server reads the
-- files, so they must be absolute):
-- -v tasks=/abs/path/data/tasks.jsonl
-- -v solutions=/abs/path/data/solutions.jsonl
--
-- Every column lands in the index -- either as an index key (the analysed
-- text columns + the columns used in predicates: id, task_id, rating) or via
-- INCLUDE (the display-only columns) -- so result rows materialise straight
-- from the index without touching the base table.
\timing on
DROP INDEX IF EXISTS tasks_idx;
DROP INDEX IF EXISTS solutions_idx;
DROP TABLE IF EXISTS tasks;
DROP TABLE IF EXISTS solutions;
DROP TEXT SEARCH DICTIONARY IF EXISTS cf_en;
DROP TEXT SEARCH DICTIONARY IF EXISTS code_grams;
DROP TEXT SEARCH DICTIONARY IF EXISTS code_grams_q;
-- English analyzer for statements/titles/editorials/tags (BM25 + phrases).
CREATE TEXT SEARCH DICTIONARY cf_en(
template = 'text', locale = 'en_US.UTF-8', case = 'lower',
stemming = true, accent = false,
frequency = true, position = true, norm = true
);
-- Index-side sparse ngrams over code (all grams; BM25-rankable).
CREATE TEXT SEARCH DICTIONARY code_grams(
template = 'sparse_ngram', frequency = true, norm = true
);
-- Query-side covering chain (referenced by name in ts_tokenize).
CREATE TEXT SEARCH DICTIONARY code_grams_q(
template = 'sparse_ngram', covering = true
);
CREATE TABLE tasks (
id TEXT PRIMARY KEY,
title TEXT,
contest_name TEXT,
rating INTEGER,
tags TEXT,
statement TEXT,
editorial TEXT
);
INSERT INTO tasks
SELECT id, title, contest_name, rating::INTEGER, tags, statement, editorial
FROM read_json_auto(:'tasks');
SELECT count(*) AS tasks FROM tasks;
CREATE TABLE solutions (
id INTEGER PRIMARY KEY,
task_id TEXT,
title TEXT,
rating INTEGER,
code TEXT,
code_len INTEGER
);
INSERT INTO solutions
SELECT id::INTEGER, task_id, title, rating::INTEGER, code, length(code)
FROM read_json_auto(:'solutions');
SELECT count(*) AS solutions FROM solutions;
-- All seven task columns covered: id/rating/title/statement/editorial/tags are
-- index keys, contest_name is INCLUDEd.
CREATE INDEX tasks_idx ON tasks
USING inverted(id, rating, title cf_en, statement cf_en, editorial cf_en, tags cf_en)
INCLUDE (contest_name);
-- All columns covered: id/task_id/rating/code are keys, title/code_len INCLUDEd
-- (code_len lets "shortest first" sort on an int, not the full code column).
CREATE INDEX solutions_idx ON solutions
USING inverted(id, task_id, rating, code code_grams)
INCLUDE (title, code_len);