我试图在JS中创建一个damerau-levenshtein距离函数。
我已经在WIkipedia上找到了算法的描述,但它们不是实现。上面写着:
为了设计一个适当的算法来计算无限制的Damerau-Levenshtein距离,注意到总是有一个最优的编辑操作序列,其中一次转置的字母以后永远不会被修改。因此,我们只需要考虑两种对称的修改子字符串的方法:(1)转置字母并在它们之间插入任意数目的字符,或者(2)删除字符序列和删除后相邻的转置字母。这种思想的直接实现给出了一个三次复杂度的算法:O_左侧(M _cdot_N \max(M,N) \右),其中M和N是字符串长度。利用Lowrance和Wagner的思想,在最坏的情况下,将该朴素算法改进为O_ be (M _cdot N_右)。很有趣的是,比特算法可以被修改为处理换位。有关这种改编的示例,请参阅信息检索部分of1。 距离
第1节指出了http://acl.ldc.upenn.edu/P/P00/P00-1037.pdf,这对我来说更加复杂。
如果我正确地理解了这一点,那么创建一个实现就不那么容易了。
下面是我目前使用的levenshtein实现:
levenshtein=function (s1, s2) {
// discuss at: http://phpjs.org/functions/levenshtein/
// original by: Carlos R. L. Rodrigues (http://www.jsfromhell.com)
// bugfixed by: Onno Marsman
// revised by: Andrea Giammarchi (http://webreflection.blogspot.com)
// reimplemented by: Brett Zamir (http://brett-zamir.me)
// reimplemented by: Alexander M Beedie
// example 1: levenshtein('Kevin van Zonneveld', 'Kevin van Sommeveld');
// returns 1: 3
if (s1 == s2) {
return 0;
}
var s1_len = s1.length;
var s2_len = s2.length;
if (s1_len === 0) {
return s2_len;
}
if (s2_len === 0) {
return s1_len;
}
// BEGIN STATIC
var split = false;
try {
split = !('0')[0];
} catch (e) {
// Earlier IE may not support access by string index
split = true;
}
// END STATIC
if (split) {
s1 = s1.split('');
s2 = s2.split('');
}
var v0 = new Array(s1_len + 1);
var v1 = new Array(s1_len + 1);
var s1_idx = 0,
s2_idx = 0,
cost = 0;
for (s1_idx = 0; s1_idx < s1_len + 1; s1_idx++) {
v0[s1_idx] = s1_idx;
}
var char_s1 = '',
char_s2 = '';
for (s2_idx = 1; s2_idx <= s2_len; s2_idx++) {
v1[0] = s2_idx;
char_s2 = s2[s2_idx - 1];
for (s1_idx = 0; s1_idx < s1_len; s1_idx++) {
char_s1 = s1[s1_idx];
cost = (char_s1 == char_s2) ? 0 : 1;
var m_min = v0[s1_idx + 1] + 1;
var b = v1[s1_idx] + 1;
var c = v0[s1_idx] + cost;
if (b < m_min) {
m_min = b;
}
if (c < m_min) {
m_min = c;
}
v1[s1_idx + 1] = m_min;
}
var v_tmp = v0;
v0 = v1;
v1 = v_tmp;
}
return v0[s1_len];
} 你对建立这样一个算法的想法是什么?如果你认为它太复杂了,我能做些什么来使'l‘(L小写)和'I’(我大写)没有区别。
发布于 2014-03-10 21:31:17
gist @doukremt给出:https://gist.github.com/doukremt/9473228
在Javascript中给出以下内容。
您可以更改微调对象中操作的权重。
var levenshteinWeighted= function(seq1,seq2)
{
var len1=seq1.length;
var len2=seq2.length;
var i, j;
var dist;
var ic, dc, rc;
var last, old, column;
var weighter={
insert:function(c) { return 1.; },
delete:function(c) { return 0.5; },
replace:function(c, d) { return 0.3; }
};
/* don't swap the sequences, or this is gonna be painful */
if (len1 == 0 || len2 == 0) {
dist = 0;
while (len1)
dist += weighter.delete(seq1[--len1]);
while (len2)
dist += weighter.insert(seq2[--len2]);
return dist;
}
column = []; // malloc((len2 + 1) * sizeof(double));
//if (!column) return -1;
column[0] = 0;
for (j = 1; j <= len2; ++j)
column[j] = column[j - 1] + weighter.insert(seq2[j - 1]);
for (i = 1; i <= len1; ++i) {
last = column[0]; /* m[i-1][0] */
column[0] += weighter.delete(seq1[i - 1]); /* m[i][0] */
for (j = 1; j <= len2; ++j) {
old = column[j];
if (seq1[i - 1] == seq2[j - 1]) {
column[j] = last; /* m[i-1][j-1] */
} else {
ic = column[j - 1] + weighter.insert(seq2[j - 1]); /* m[i][j-1] */
dc = column[j] + weighter.delete(seq1[i - 1]); /* m[i-1][j] */
rc = last + weighter.replace(seq1[i - 1], seq2[j - 1]); /* m[i-1][j-1] */
column[j] = ic < dc ? ic : (dc < rc ? dc : rc);
}
last = old;
}
}
dist = column[len2];
return dist;
}发布于 2021-11-09 20:44:27
从这里偷来的,带有格式设置和一些如何使用它的示例:
function DamerauLevenshtein(prices, damerau) {
//'prices' customisation of the edit costs by passing an object with optional 'insert', 'remove', 'substitute', and
//'transpose' keys, corresponding to either a constant number, or a function that returns the cost. The default cost
//for each operation is 1. The price functions take relevant character(s) as arguments, should return numbers, and
//have the following form:
//
//insert: function (inserted) { return NUMBER; }
//
//remove: function (removed) { return NUMBER; }
//
//substitute: function (from, to) { return NUMBER; }
//
//transpose: function (backward, forward) { return NUMBER; }
//
//The damerau flag allows us to turn off transposition and only do plain Levenshtein distance.
if (damerau !== false) {
damerau = true;
}
if (!prices) {
prices = {};
}
let insert, remove, substitute, transpose;
switch (typeof prices.insert) {
case 'function':
insert = prices.insert;
break;
case 'number':
insert = function (c) {
return prices.insert;
};
break;
default:
insert = function (c) {
return 1;
};
break;
}
switch (typeof prices.remove) {
case 'function':
remove = prices.remove;
break;
case 'number':
remove = function (c) {
return prices.remove;
};
break;
default:
remove = function (c) {
return 1;
};
break;
}
switch (typeof prices.substitute) {
case 'function':
substitute = prices.substitute;
break;
case 'number':
substitute = function (from, to) {
return prices.substitute;
};
break;
default:
substitute = function (from, to) {
return 1;
};
break;
}
switch (typeof prices.transpose) {
case 'function':
transpose = prices.transpose;
break;
case 'number':
transpose = function (backward, forward) {
return prices.transpose;
};
break;
default:
transpose = function (backward, forward) {
return 1;
};
break;
}
function distance(down, across) {
//http://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance
let ds = [];
if (down === across) {
return 0;
} else {
down = down.split('');
down.unshift(null);
across = across.split('');
across.unshift(null);
down.forEach(function (d, i) {
if (!ds[i]) {
ds[i] = [];
}
across.forEach(function (a, j) {
if (i === 0 && j === 0) {
ds[i][j] = 0;
} else if (i === 0) {
//Empty down (i == 0) -> across[1..j] by inserting
ds[i][j] = ds[i][j - 1] + insert(a);
} else if (j === 0) {
//Down -> empty across (j == 0) by deleting
ds[i][j] = ds[i - 1][j] + remove(d);
} else {
//Find the least costly operation that turns the prefix down[1..i] into the prefix across[1..j] using
//already calculated costs for getting to shorter matches.
ds[i][j] = Math.min(
//Cost of editing down[1..i-1] to across[1..j] plus cost of deleting
//down[i] to get to down[1..i-1].
ds[i - 1][j] + remove(d),
//Cost of editing down[1..i] to across[1..j-1] plus cost of inserting across[j] to get to across[1..j].
ds[i][j - 1] + insert(a),
//Cost of editing down[1..i-1] to across[1..j-1] plus cost of substituting down[i] (d) with across[j]
//(a) to get to across[1..j].
ds[i - 1][j - 1] + (d === a ? 0 : substitute(d, a))
);
//Can we match the last two letters of down with across by transposing them? Cost of getting from
//down[i-2] to across[j-2] plus cost of moving down[i-1] forward and down[i] backward to match
//across[j-1..j].
if (damerau && i > 1 && j > 1 && down[i - 1] === a && d === across[j - 1]) {
ds[i][j] = Math.min(ds[i][j], ds[i - 2][j - 2] + (d === a ? 0 : transpose(d, down[i - 1])));
}
}
});
});
return ds[down.length - 1][across.length - 1];
}
}
return distance;
}
//Returns a distance function to call.
let dl = DamerauLevenshtein();
console.log(dl('12345', '23451'));
console.log(dl('this is a test', 'this is not a test'));
console.log(dl('testing testing 123', 'test'));https://stackoverflow.com/questions/22308014
复制相似问题