我用 Postgres 构建了一个搜索引擎，运行得很好。我为所支持的主要语言使用了 Hunspell 词典，下面是我的配置方式：
-- Install the unaccent extension into the public schema unless it is already present.
-- The text search mappings in this script list "unaccent" as their first dictionary.
CREATE EXTENSION IF NOT EXISTS unaccent
    WITH SCHEMA public;
-- English configuration: unaccent first, then english_hunspell, then english_stem.
--
-- Every other language in this script creates its *_unaccent_hunspell configuration
-- as a copy of <language>_hunspell before altering it. English only had the ALTER,
-- which fails with "text search configuration ... does not exist" on a fresh database.
-- CREATE TEXT SEARCH CONFIGURATION has no IF NOT EXISTS form, so the creation is
-- guarded by a catalog lookup; the script still works when the configuration was
-- already created elsewhere.
-- NOTE(review): assumes a configuration named english_hunspell exists, in the same
-- way the sibling sections assume <language>_hunspell does -- confirm.
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1
        FROM pg_catalog.pg_ts_config AS c
        WHERE c.cfgname = 'english_unaccent_hunspell'
          AND pg_catalog.pg_ts_config_is_visible(c.oid)
    ) THEN
        CREATE TEXT SEARCH CONFIGURATION english_unaccent_hunspell (
            COPY = english_hunspell
        );
    END IF;
END
$$;

-- Route all word-like token types through the three dictionaries, in this order.
ALTER TEXT SEARCH CONFIGURATION english_unaccent_hunspell
    ALTER MAPPING FOR asciiword, asciihword, hword_asciipart, word, hword, hword_part
    WITH unaccent, english_hunspell, english_stem;
-- Brazilian Portuguese: start from a copy of portuguese_brazil_hunspell, then map the
-- word-like token types to unaccent, portuguese_brazil_hunspell and portuguese_stem,
-- listed in that order.
CREATE TEXT SEARCH CONFIGURATION portuguese_brazil_unaccent_hunspell (COPY = portuguese_brazil_hunspell);

ALTER TEXT SEARCH CONFIGURATION portuguese_brazil_unaccent_hunspell
    ALTER MAPPING FOR
        asciiword,
        asciihword,
        hword_asciipart,
        word,
        hword,
        hword_part
    WITH
        unaccent,
        portuguese_brazil_hunspell,
        portuguese_stem;
-- Spanish: start from a copy of spanish_hunspell, then map the word-like token types
-- to unaccent, spanish_hunspell and spanish_stem, listed in that order.
CREATE TEXT SEARCH CONFIGURATION spanish_unaccent_hunspell (COPY = spanish_hunspell);

ALTER TEXT SEARCH CONFIGURATION spanish_unaccent_hunspell
    ALTER MAPPING FOR
        asciiword,
        asciihword,
        hword_asciipart,
        word,
        hword,
        hword_part
    WITH
        unaccent,
        spanish_hunspell,
        spanish_stem;
-- Italian: start from a copy of italian_hunspell, then map the word-like token types
-- to unaccent, italian_hunspell and italian_stem, listed in that order.
CREATE TEXT SEARCH CONFIGURATION italian_unaccent_hunspell (COPY = italian_hunspell);

ALTER TEXT SEARCH CONFIGURATION italian_unaccent_hunspell
    ALTER MAPPING FOR
        asciiword,
        asciihword,
        hword_asciipart,
        word,
        hword,
        hword_part
    WITH
        unaccent,
        italian_hunspell,
        italian_stem;
-- Russian: start from a copy of russian_hunspell, then map the word-like token types
-- to unaccent, russian_hunspell and russian_stem, listed in that order.
CREATE TEXT SEARCH CONFIGURATION russian_unaccent_hunspell (COPY = russian_hunspell);

ALTER TEXT SEARCH CONFIGURATION russian_unaccent_hunspell
    ALTER MAPPING FOR
        asciiword,
        asciihword,
        hword_asciipart,
        word,
        hword,
        hword_part
    WITH
        unaccent,
        russian_hunspell,
        russian_stem;
-- French: start from a copy of french_hunspell, then map the word-like token types
-- to unaccent, french_hunspell and french_stem, listed in that order.
CREATE TEXT SEARCH CONFIGURATION french_unaccent_hunspell (COPY = french_hunspell);

ALTER TEXT SEARCH CONFIGURATION french_unaccent_hunspell
    ALTER MAPPING FOR
        asciiword,
        asciihword,
        hword_asciipart,
        word,
        hword,
        hword_part
    WITH
        unaccent,
        french_hunspell,
        french_stem;
-- German: start from a copy of german_hunspell, then map the word-like token types
-- to unaccent, german_hunspell and german_stem, listed in that order.
CREATE TEXT SEARCH CONFIGURATION german_unaccent_hunspell (COPY = german_hunspell);

ALTER TEXT SEARCH CONFIGURATION german_unaccent_hunspell
    ALTER MAPPING FOR
        asciiword,
        asciihword,
        hword_asciipart,
        word,
        hword,
        hword_part
    WITH
        unaccent,
        german_hunspell,
        german_stem;
-- Add a tsvector column named "documentFts" to each of the two search-index tables.
ALTER TABLE "earliest_search_indices" ADD COLUMN "documentFts" tsvector;
ALTER TABLE "latest_search_indices" ADD COLUMN "documentFts" tsvector;
-- English rows: build "documentFts" from title (weight A), "directoryDescription"
-- (weight B) and body (weight C); a NULL field is treated as an empty string.
UPDATE "earliest_search_indices"
SET "documentFts" =
        setweight(to_tsvector('english_unaccent_hunspell', coalesce(title, '')), 'A')
     || setweight(to_tsvector('english_unaccent_hunspell', coalesce("directoryDescription", '')), 'B')
     || setweight(to_tsvector('english_unaccent_hunspell', coalesce(body, '')), 'C')
WHERE "language" = 'english';
-- Portuguese rows (language = 'portuguese') use the portuguese_brazil_* configuration.
-- Build "documentFts" from title (weight A), "directoryDescription" (weight B) and
-- body (weight C); a NULL field is treated as an empty string.
UPDATE "earliest_search_indices"
SET "documentFts" =
        setweight(to_tsvector('portuguese_brazil_unaccent_hunspell', coalesce(title, '')), 'A')
     || setweight(to_tsvector('portuguese_brazil_unaccent_hunspell', coalesce("directoryDescription", '')), 'B')
     || setweight(to_tsvector('portuguese_brazil_unaccent_hunspell', coalesce(body, '')), 'C')
WHERE "language" = 'portuguese';
-- Spanish rows: build "documentFts" from title (weight A), "directoryDescription"
-- (weight B) and body (weight C); a NULL field is treated as an empty string.
UPDATE "earliest_search_indices"
SET "documentFts" =
        setweight(to_tsvector('spanish_unaccent_hunspell', coalesce(title, '')), 'A')
     || setweight(to_tsvector('spanish_unaccent_hunspell', coalesce("directoryDescription", '')), 'B')
     || setweight(to_tsvector('spanish_unaccent_hunspell', coalesce(body, '')), 'C')
WHERE "language" = 'spanish';
-- French rows: build "documentFts" from title (weight A), "directoryDescription"
-- (weight B) and body (weight C); a NULL field is treated as an empty string.
UPDATE "earliest_search_indices"
SET "documentFts" =
        setweight(to_tsvector('french_unaccent_hunspell', coalesce(title, '')), 'A')
     || setweight(to_tsvector('french_unaccent_hunspell', coalesce("directoryDescription", '')), 'B')
     || setweight(to_tsvector('french_unaccent_hunspell', coalesce(body, '')), 'C')
WHERE "language" = 'french';
-- Italian rows: build "documentFts" from title (weight A), "directoryDescription"
-- (weight B) and body (weight C); a NULL field is treated as an empty string.
UPDATE "earliest_search_indices"
SET "documentFts" =
        setweight(to_tsvector('italian_unaccent_hunspell', coalesce(title, '')), 'A')
     || setweight(to_tsvector('italian_unaccent_hunspell', coalesce("directoryDescription", '')), 'B')
     || setweight(to_tsvector('italian_unaccent_hunspell', coalesce(body, '')), 'C')
WHERE "language" = 'italian';
-- German rows: build "documentFts" from title (weight A), "directoryDescription"
-- (weight B) and body (weight C); a NULL field is treated as an empty string.
UPDATE "earliest_search_indices"
SET "documentFts" =
        setweight(to_tsvector('german_unaccent_hunspell', coalesce(title, '')), 'A')
     || setweight(to_tsvector('german_unaccent_hunspell', coalesce("directoryDescription", '')), 'B')
     || setweight(to_tsvector('german_unaccent_hunspell', coalesce(body, '')), 'C')
WHERE "language" = 'german';
-- Russian rows: build "documentFts" from title (weight A), "directoryDescription"
-- (weight B) and body (weight C); a NULL field is treated as an empty string.
UPDATE "earliest_search_indices"
SET "documentFts" =
        setweight(to_tsvector('russian_unaccent_hunspell', coalesce(title, '')), 'A')
     || setweight(to_tsvector('russian_unaccent_hunspell', coalesce("directoryDescription", '')), 'B')
     || setweight(to_tsvector('russian_unaccent_hunspell', coalesce(body, '')), 'C')
WHERE "language" = 'russian';
CREATE INDEX entries_document_fts ON "earliest_search_indices" USING GIN ("documentFts");

我使用的词典在这里：
https://github.com/ericmackrodt/hunspell_dicts
这样效果很好，行为完全符合我的预期，但由于停用词消除，也带来了一些问题。在大多数情况下这样处理没有问题，但在某些例外情况下，保留停用词非常重要。下面是一些示例：
例如搜索 "The Sims" 时，实际搜索的只是 "Sims"，因为单词 "the" 被当作停用词消除了。
那么，我的问题是，我怎样才能在词典中添加这些例外呢？比如，如果单词 "who" 前面是 "doctor"（即 "Doctor Who"），就把它们作为一个整体一起索引。
我不介意手动添加这些例外。
提前谢谢。
发布于 2022-11-23 15:57:00
您可以通过配置所使用的词典来更改停用词列表：
http://www.postgresql.org/docs/current/static/textsearch-dictionaries.html
做法可以参考这个 Stack Overflow 回答。
https://stackoverflow.com/questions/73230343
复制相似问题