文章/答案/技术大牛

发布

社区首页 >问答首页 >在Postgres in向量中包含预定义短语的停止词

问在Postgres tsvector中保留预定义短语里的停止词
EN

Stack Overflow用户

提问于 2022-08-04 04:21:14

回答 1查看 63关注 0票数 0

我已经构建了一个使用Postgres的搜索引擎，它运行得很好。我为所支持的主要语言使用了hunspell词典，下面是我的配置方式：

-- Install the unaccent extension into the public schema (no-op when it is
-- already installed). The text search mappings in this script reference the
-- "unaccent" dictionary.
CREATE EXTENSION IF NOT EXISTS unaccent
  WITH SCHEMA public;

-- English: create the unaccent-aware configuration as a copy of the hunspell
-- one. ALTER TEXT SEARCH CONFIGURATION requires the configuration to exist,
-- and every other language in this script creates its configuration via COPY
-- before altering it, so English follows the same pattern.
-- NOTE(review): remove this CREATE if english_unaccent_hunspell is already
-- created somewhere outside this script, otherwise it will raise an error.
CREATE TEXT SEARCH CONFIGURATION english_unaccent_hunspell (
  COPY = english_hunspell
);

-- Word-like token types are run through the dictionary chain
-- unaccent -> english_hunspell -> english_stem (order matters).
ALTER TEXT SEARCH CONFIGURATION english_unaccent_hunspell
  ALTER MAPPING FOR asciiword, asciihword, hword_asciipart, word, hword, hword_part
  WITH unaccent, english_hunspell, english_stem;

-- Brazilian Portuguese: clone the hunspell configuration under a new name.
CREATE TEXT SEARCH CONFIGURATION portuguese_brazil_unaccent_hunspell (COPY = portuguese_brazil_hunspell);

-- Map the word-like token types to the dictionary chain
-- unaccent -> portuguese_brazil_hunspell -> portuguese_stem (order matters).
ALTER TEXT SEARCH CONFIGURATION portuguese_brazil_unaccent_hunspell
  ALTER MAPPING FOR
    asciiword,
    asciihword,
    hword_asciipart,
    word,
    hword,
    hword_part
  WITH unaccent, portuguese_brazil_hunspell, portuguese_stem;

-- Spanish: clone the hunspell configuration under a new name.
CREATE TEXT SEARCH CONFIGURATION spanish_unaccent_hunspell (COPY = spanish_hunspell);

-- Map the word-like token types to the dictionary chain
-- unaccent -> spanish_hunspell -> spanish_stem (order matters).
ALTER TEXT SEARCH CONFIGURATION spanish_unaccent_hunspell
  ALTER MAPPING FOR
    asciiword,
    asciihword,
    hword_asciipart,
    word,
    hword,
    hword_part
  WITH unaccent, spanish_hunspell, spanish_stem;

-- Italian: clone the hunspell configuration under a new name.
CREATE TEXT SEARCH CONFIGURATION italian_unaccent_hunspell (COPY = italian_hunspell);

-- Map the word-like token types to the dictionary chain
-- unaccent -> italian_hunspell -> italian_stem (order matters).
ALTER TEXT SEARCH CONFIGURATION italian_unaccent_hunspell
  ALTER MAPPING FOR
    asciiword,
    asciihword,
    hword_asciipart,
    word,
    hword,
    hword_part
  WITH unaccent, italian_hunspell, italian_stem;

-- Russian: clone the hunspell configuration under a new name.
CREATE TEXT SEARCH CONFIGURATION russian_unaccent_hunspell (COPY = russian_hunspell);

-- Map the word-like token types to the dictionary chain
-- unaccent -> russian_hunspell -> russian_stem (order matters).
ALTER TEXT SEARCH CONFIGURATION russian_unaccent_hunspell
  ALTER MAPPING FOR
    asciiword,
    asciihword,
    hword_asciipart,
    word,
    hword,
    hword_part
  WITH unaccent, russian_hunspell, russian_stem;

-- French: clone the hunspell configuration under a new name.
CREATE TEXT SEARCH CONFIGURATION french_unaccent_hunspell (COPY = french_hunspell);

-- Map the word-like token types to the dictionary chain
-- unaccent -> french_hunspell -> french_stem (order matters).
ALTER TEXT SEARCH CONFIGURATION french_unaccent_hunspell
  ALTER MAPPING FOR
    asciiword,
    asciihword,
    hword_asciipart,
    word,
    hword,
    hword_part
  WITH unaccent, french_hunspell, french_stem;

-- German: clone the hunspell configuration under a new name.
CREATE TEXT SEARCH CONFIGURATION german_unaccent_hunspell (COPY = german_hunspell);

-- Map the word-like token types to the dictionary chain
-- unaccent -> german_hunspell -> german_stem (order matters).
ALTER TEXT SEARCH CONFIGURATION german_unaccent_hunspell
  ALTER MAPPING FOR
    asciiword,
    asciihword,
    hword_asciipart,
    word,
    hword,
    hword_part
  WITH unaccent, german_hunspell, german_stem;

-- Add a tsvector column named "documentFts" to both search-index tables.
-- NOTE(review): the remaining statements visible in this script only populate
-- and index "earliest_search_indices"; confirm that "latest_search_indices"
-- is handled elsewhere.
ALTER TABLE "earliest_search_indices" ADD COLUMN "documentFts" tsvector;

ALTER TABLE "latest_search_indices" ADD COLUMN "documentFts" tsvector;

-- Build the weighted search vector for rows whose "language" is 'english':
-- title -> weight A, "directoryDescription" -> weight B, body -> weight C.
-- coalesce() substitutes '' for NULL text before it is converted.
UPDATE "earliest_search_indices"
SET "documentFts" =
     setweight(to_tsvector('english_unaccent_hunspell', coalesce(title, '')), 'A')
  || setweight(to_tsvector('english_unaccent_hunspell', coalesce("directoryDescription", '')), 'B')
  || setweight(to_tsvector('english_unaccent_hunspell', coalesce(body, '')), 'C')
WHERE "language" = 'english';

-- Build the weighted search vector for rows whose "language" is 'portuguese',
-- using the Brazilian Portuguese configuration:
-- title -> weight A, "directoryDescription" -> weight B, body -> weight C.
-- coalesce() substitutes '' for NULL text before it is converted.
UPDATE "earliest_search_indices"
SET "documentFts" =
     setweight(to_tsvector('portuguese_brazil_unaccent_hunspell', coalesce(title, '')), 'A')
  || setweight(to_tsvector('portuguese_brazil_unaccent_hunspell', coalesce("directoryDescription", '')), 'B')
  || setweight(to_tsvector('portuguese_brazil_unaccent_hunspell', coalesce(body, '')), 'C')
WHERE "language" = 'portuguese';

-- Build the weighted search vector for rows whose "language" is 'spanish':
-- title -> weight A, "directoryDescription" -> weight B, body -> weight C.
-- coalesce() substitutes '' for NULL text before it is converted.
UPDATE "earliest_search_indices"
SET "documentFts" =
     setweight(to_tsvector('spanish_unaccent_hunspell', coalesce(title, '')), 'A')
  || setweight(to_tsvector('spanish_unaccent_hunspell', coalesce("directoryDescription", '')), 'B')
  || setweight(to_tsvector('spanish_unaccent_hunspell', coalesce(body, '')), 'C')
WHERE "language" = 'spanish';

-- Build the weighted search vector for rows whose "language" is 'french':
-- title -> weight A, "directoryDescription" -> weight B, body -> weight C.
-- coalesce() substitutes '' for NULL text before it is converted.
UPDATE "earliest_search_indices"
SET "documentFts" =
     setweight(to_tsvector('french_unaccent_hunspell', coalesce(title, '')), 'A')
  || setweight(to_tsvector('french_unaccent_hunspell', coalesce("directoryDescription", '')), 'B')
  || setweight(to_tsvector('french_unaccent_hunspell', coalesce(body, '')), 'C')
WHERE "language" = 'french';

-- Build the weighted search vector for rows whose "language" is 'italian':
-- title -> weight A, "directoryDescription" -> weight B, body -> weight C.
-- coalesce() substitutes '' for NULL text before it is converted.
UPDATE "earliest_search_indices"
SET "documentFts" =
     setweight(to_tsvector('italian_unaccent_hunspell', coalesce(title, '')), 'A')
  || setweight(to_tsvector('italian_unaccent_hunspell', coalesce("directoryDescription", '')), 'B')
  || setweight(to_tsvector('italian_unaccent_hunspell', coalesce(body, '')), 'C')
WHERE "language" = 'italian';

-- Build the weighted search vector for rows whose "language" is 'german':
-- title -> weight A, "directoryDescription" -> weight B, body -> weight C.
-- coalesce() substitutes '' for NULL text before it is converted.
UPDATE "earliest_search_indices"
SET "documentFts" =
     setweight(to_tsvector('german_unaccent_hunspell', coalesce(title, '')), 'A')
  || setweight(to_tsvector('german_unaccent_hunspell', coalesce("directoryDescription", '')), 'B')
  || setweight(to_tsvector('german_unaccent_hunspell', coalesce(body, '')), 'C')
WHERE "language" = 'german';

-- Build the weighted search vector for rows whose "language" is 'russian':
-- title -> weight A, "directoryDescription" -> weight B, body -> weight C.
-- coalesce() substitutes '' for NULL text before it is converted.
UPDATE "earliest_search_indices"
SET "documentFts" =
     setweight(to_tsvector('russian_unaccent_hunspell', coalesce(title, '')), 'A')
  || setweight(to_tsvector('russian_unaccent_hunspell', coalesce("directoryDescription", '')), 'B')
  || setweight(to_tsvector('russian_unaccent_hunspell', coalesce(body, '')), 'C')
WHERE "language" = 'russian';

-- GIN index over the "documentFts" tsvector column of "earliest_search_indices".
CREATE INDEX entries_document_fts
  ON "earliest_search_indices"
  USING GIN ("documentFts");

我使用的字典就在这里：

https://github.com/ericmackrodt/hunspell_dicts

这样很好，它的行为完全符合我的预期，但停止词消除也带来了一些问题。在大多数情况下这没有问题，但也有一些例外情况，在这些情况下保留停止词非常重要。下面是一些示例：

The Sims -- 这会导致实际只搜索 "Sims"，因为单词 "the" 被消除了。

Doctor Who -- 这会导致实际只搜索 "Doctor"，因为 "Who" 被消除了。
The Who -- 这会导致实际搜索 ""（空），因为 "the" 和 "who" 都被消除了。

那么，我的问题是，我怎样才能在我的字典中添加这些例外呢？比如，如果 "Who" 一词前面是 "Doctor"，那就把它们作为一个整体一起索引。

我不介意手动添加这些例外。

提前谢谢。

stop-words

tsvector

postgresql

dictionary

full-text-search

回答 1

Stack Overflow用户

发布于 2022-11-23 15:57:00

您可以通过配置所需的字典来更改停止词列表：

http://www.postgresql.org/docs/current/static/textsearch-dictionaries.html

可以参考这个 Stack Overflow 回答中的做法。

票数 0

页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持

原文链接：

https://stackoverflow.com/questions/73230343

复制

相似问题

问在Postgres in向量中包含预定义短语的停止词
EN

回答 1

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问在Postgres in向量中包含预定义短语的停止词EN

回答 1

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问在Postgres in向量中包含预定义短语的停止词
EN