我尝试为语音识别应用程序实现DTW算法,我已经成功地做到了这一点,现在我正在尝试通过剪枝来提高DTW算法的性能。我试图对这个算法进行改进,并发现我应该以某种方式在特定范围内计算2D数组DTW中的值,如Image #1所示,但我似乎不知道如何做到这一点。有人能帮上忙吗?算法的代码包括在内(C#)

/// <summary>
/// Calculates the distance between two audio frames in the form of two MFCCFrame objects as input parameters
/// returns the difference in a double
/// </summary>
double distance(MFCCFrame frame1, MFCCFrame frame2)
{
double tempSum = 0;
for (int i = 0; i < 13; i++)
tempSum += Math.Pow(Math.Abs(frame1.Features[i] - frame2.Features[i]), 2);
return Math.Sqrt(tempSum);
}
/// <summary>
/// DTW Algorithms
/// Takes input 2 sequences: seq1 and seq2 to calculate the shortest distance between them
/// Uses the function "distance" defined above to calculate distance between two frames
/// 2D array -> DTW[,] with dimentions: number of frames in seq 1, number of frames in seq2
/// returns the last element in the 2D array, which is the difference between the two sequences
/// </summary>
double DTWDistance(Sequence seq1, Sequence seq2)
{
int m = seq1.Frames.Length, n = seq2.Frames.Length;
double[,] DTW = new double[m, n];
DTW[0, 0] = 0;
for (int i = 0; i < m; i++)
{
for (int j = 0; j < n; j++)
{
double cost = distance(seq1.Frames[i], seq2.Frames[j]);
if (i == 0 && j == 0)
DTW[i, j] = cost;
else if (i == 0)
DTW[i, j] = cost + DTW[i, j - 1];
else if (j == 0)
DTW[i, j] = cost + DTW[i - 1, j];
else
DTW[i, j] = (cost + Math.Min(DTW[i - 1, j], Math.Min(DTW[i, j - 1], DTW[i - 1, j - 1])));
}
}
}发布于 2015-12-27 15:01:32
由于没有人回答,我想我应该回答这个问题,以防有人需要帮助--这也是常规的DTW算法:
/// <summary>
/// DTW Algorithms
/// Takes input 2 sequences: input and template to calculate the shortest distance between them
/// Uses the function "distance" defined above to calculate distance between two frames
/// 2D array -> DTW[,] with dimentions: number of frames in input, number of frames in template
/// returns the last element in the 2D array, which is the difference between the two sequences
/// </summary>
///
double DTWDistance(Sequence input, Sequence template)
{
int rows = input.Frames.Length, columns = template.Frames.Length;
if (rows < (double)(columns / 2) || columns < (double)(rows / 2))
{
return double.MaxValue;
}
double[,] DTW = new double[rows, columns];
DTW[0, 0] = 0;
for (int i = 0; i < rows; i++)
{
for (int j = 0; j < columns; j++)
{
double cost = distance(input.Frames[i], template.Frames[j]);
if (i == 0 && j == 0)
DTW[i, j] = cost;
else if (i == 0)
DTW[i, j] = cost + DTW[i, j - 1];
else if (j == 0)
DTW[i, j] = cost + DTW[i - 1, j];
else
DTW[i, j] = (cost + Math.Min(DTW[i - 1, j], DTW[i - 1, j - 1]));// insert ,match
}
}
return DTW[rows - 1, columns - 1];
}我还在常规DTW算法上实现了剪枝策略,如下所示:
/// <summary>
/// DTW Algotithm with Pruning
/// Only Sakoe-Chiba band is caluculated and the rest is pruned
/// </summary>
float Pruning_DTWDistance(Sequence input, Sequence output)
{
int rows = input.Frames.Length, columns = output.Frames.Length;
if (rows < (double)(columns / 2) || columns < (double)(rows / 2))
{
return float.MaxValue;
}
float cost;
float[,] DTW = new float[rows + 1, columns + 1];
int w = Math.Abs(columns - rows);// window length -> |rows - columns|<= w
for (int i = 1; i <= rows; i++)
{
for (int j = Math.Max(1, i - w); j <= Math.Min(columns, i + w); j++)
{
if (DTW[i - 1, j] == 0)
DTW[i - 1, j] = float.MaxValue;
if (DTW[i - 1, j - 1] == 0)
DTW[i - 1, j - 1] = float.MaxValue;
DTW[0, 0] = 0;
cost = distance(input.Frames[i - 1], output.Frames[j - 1]);// frames 0 based
DTW[i, j] = (cost + Math.Min(DTW[i - 1, j], DTW[i - 1, j - 1]));// insert ,match
}
}
return DTW[rows, columns];
}这两个函数都使用帮助函数distance,定义如下:
/// <summary>
/// Calculates the distance between two audio frames in the form of MFCCFrame objects
/// returns the difference in a float
/// </summary>
///
float distance(MFCCFrame frame1, MFCCFrame frame2)
{
double tempSum = 0;
for (int i = 0; i < 13; i++)
tempSum += Math.Pow(Math.Abs(frame1.Features[i] - frame2.Features[i]), 2);
return (float)Math.Sqrt(tempSum);
}为了提高DTW algoritm的内存消耗,我只使用了两个数组而不是一个2D矩阵,此更改后的算法如下所示:
double DTW_improved(Sequence input, Sequence template)
{
// Don't compare two sequences if one of their lengths is half the other's
if (input.Frames.Length <= (0.5 * template.Frames.Length) || template.Frames.Length <= (0.5 * input.Frames.Length))
return double.PositiveInfinity;
int rows = template.Frames.Length, columns = input.Frames.Length;
double[] c1 = new double[rows];
double[] c2 = new double[rows];
double[] temp; // To hold address only (use it in swapping address)
c1[0] = distance(input.Frames[0], template.Frames[0]);
for (int i = 1; i < rows; i++)
c1[i] = c1[i - 1] + distance(input.Frames[0], template.Frames[i]);
for (int i = 1; i < columns; i++)
{
c2[0] = distance(input.Frames[i], template.Frames[0]) + c1[0];
c2[1] = distance(input.Frames[i], template.Frames[1]) + Math.Min(c1[0], c1[1]);
// Calculating first 2 elements of the array before the loop
//since they both need special conditions
for (int j = 2; j < rows; j++)
c2[j] = Math.Min(c1[j], Math.Min(c1[j - 1], c1[j - 2])) + distance(input.Frames[i], template.Frames[j]);
if (i != columns - 1) // Swapping addresses of c1 & c2
{
temp = c2;
c2 = c1;
c1 = temp;
}
}
return c2[rows - 1] / (0.5 * (input.Frames.Length + template.Frames.Length)); // Normalization: Dividing edit distance by average of input length & template length
}https://stackoverflow.com/questions/34337005
复制相似问题