codesearch/load.sql at main · serenedb/codesearch · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
-- Load the fetched dataset into SereneDB and build index-covering inverted
-- indexes for the code-search app.
--
-- Paths are passed in by setup.sh as psql variables (the server reads the
-- files, so they must be absolute):
--   -v tasks=/abs/path/data/tasks.jsonl
--   -v solutions=/abs/path/data/solutions.jsonl
--
-- Every column lands in the index -- either as an index key (the analysed
-- text columns + the columns used in predicates: id, task_id, rating) or via
-- INCLUDE (the display-only columns) -- so result rows materialise straight
-- from the index without touching the base table.

\timing on

DROP INDEX IF EXISTS tasks_idx;
DROP INDEX IF EXISTS solutions_idx;
DROP TABLE IF EXISTS tasks;
DROP TABLE IF EXISTS solutions;
DROP TEXT SEARCH DICTIONARY IF EXISTS cf_en;
DROP TEXT SEARCH DICTIONARY IF EXISTS code_grams;
DROP TEXT SEARCH DICTIONARY IF EXISTS code_grams_q;

-- English analyzer for statements/titles/editorials/tags (BM25 + phrases).
CREATE TEXT SEARCH DICTIONARY cf_en(
    template = 'text', locale = 'en_US.UTF-8', case = 'lower',
    stemming = true, accent = false,
    frequency = true, position = true, norm = true
);

-- Index-side sparse ngrams over code (all grams; BM25-rankable).
CREATE TEXT SEARCH DICTIONARY code_grams(
    template = 'sparse_ngram', frequency = true, norm = true
);

-- Query-side covering chain (referenced by name in ts_tokenize).
CREATE TEXT SEARCH DICTIONARY code_grams_q(
    template = 'sparse_ngram', covering = true
);

CREATE TABLE tasks (
  id           TEXT PRIMARY KEY,
  title        TEXT,
  contest_name TEXT,
  rating       INTEGER,
  tags         TEXT,
  statement    TEXT,
  editorial    TEXT
);

INSERT INTO tasks
SELECT id, title, contest_name, rating::INTEGER, tags, statement, editorial
FROM read_json_auto(:'tasks');

SELECT count(*) AS tasks FROM tasks;

CREATE TABLE solutions (
  id       INTEGER PRIMARY KEY,
  task_id  TEXT,
  title    TEXT,
  rating   INTEGER,
  code     TEXT,
  code_len INTEGER
);

INSERT INTO solutions
SELECT id::INTEGER, task_id, title, rating::INTEGER, code, length(code)
FROM read_json_auto(:'solutions');

SELECT count(*) AS solutions FROM solutions;

-- All seven task columns covered: id/rating/title/statement/editorial/tags are
-- index keys, contest_name is INCLUDEd.
CREATE INDEX tasks_idx ON tasks
USING inverted(id, rating, title cf_en, statement cf_en, editorial cf_en, tags cf_en)
INCLUDE (contest_name);

-- All columns covered: id/task_id/rating/code are keys, title/code_len INCLUDEd
-- (code_len lets "shortest first" sort on an int, not the full code column).
CREATE INDEX solutions_idx ON solutions
USING inverted(id, task_id, rating, code code_grams)
INCLUDE (title, code_len);