.Docx文档似乎没有被索引。
我在.docx中使用了唯一的字符串,但在搜索“one”时不返回该.docx。
例如,下面的案文如下:
“这是第一行的文本,第二行的文本。”
将通过iFilter提取如下:
“这是1行的文本,这里是第2行的文本。”
因此,当IFilter解析.docx时,它会删除换行分隔符,并尝试解析“and and here”。
因此,.docx的IFilter似乎会将一行的最后一个单词与下一行的第一个单词连在一起。
有人能给出一些如何绕过这个问题的想法吗?
提前谢谢。
发布于 2011-08-24 15:46:34
好了,我想出来了。基本上,64位IFilter不能正常工作。它会把由换行符分隔的单词合并在一起,而不是原样传递。我使用Ionic.Zip访问docx归档文件,并使用稍微修改过的DocxToText版本解析了重要的xml文件。现在一切运作良好。
以下是基于Jevgenij Pankov最初编写的代码修改后的版本:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using Ionic.Zip;
using System.IO;
using System.Xml;
public class DocxToText
{
// XML namespace of the package's "[Content_Types].xml" part (the "t" prefix in DocumentXmlXPath).
private const string ContentTypeNamespace =
@"http://schemas.openxmlformats.org/package/2006/content-types";
// WordprocessingML main namespace (the "w" prefix in BodyXPath).
private const string WordprocessingMlNamespace =
@"http://schemas.openxmlformats.org/wordprocessingml/2006/main";
// XPath selecting the content-types Override element whose ContentType
// is that of the main WordprocessingML document part.
private const string DocumentXmlXPath =
"/t:Types/t:Override[@ContentType=\"" +
"application/vnd.openxmlformats-officedocument." +
"wordprocessingml.document.main+xml\"]";
// XPath of the body element inside the main document part.
private const string BodyXPath = "/w:document/w:body";
// Path of the .docx file; assigned by the constructor.
private string docxFile = "";
// Zip entry name of the main document part; assigned by ExtractText().
private string docxFileLocation = "";
/// <summary>
/// Creates an extractor for the given Docx file. Only the file name is
/// stored here; the file itself is not opened by the constructor.
/// </summary>
/// <param name="fileName">Path of the Docx file to read.</param>
public DocxToText(string fileName)
{
    this.docxFile = fileName;
}
#region ExtractText()
/// <summary>
/// Extracts text from the Docx file.
/// </summary>
/// <returns>Extracted text.</returns>
/// <exception cref="InvalidOperationException">
/// No input file name was supplied to the constructor.
/// </exception>
/// <exception cref="InvalidDataException">
/// The location of the main document part could not be determined.
/// </exception>
public string ExtractText()
{
    // Specific exception types are used instead of bare System.Exception;
    // both derive from Exception, so existing catch blocks keep working.
    if (string.IsNullOrEmpty(docxFile))
    {
        throw new InvalidOperationException("Input file not specified.");
    }

    // Usually it is "word/document.xml" (the leading '/' is already stripped).
    docxFileLocation = FindDocumentXmlLocation();
    if (string.IsNullOrEmpty(docxFileLocation))
    {
        throw new InvalidDataException("It is not a valid Docx file.");
    }

    return ReadDocumentXml();
}
#endregion
#region FindDocumentXmlLocation()
/// <summary>
/// Gets location of the "document.xml" zip entry by looking it up in the
/// package's "[Content_Types].xml" entry.
/// </summary>
/// <returns>
/// Location of the "document.xml" without a leading '/', or null when the
/// content-types entry or the matching Override element is missing.
/// </returns>
private string FindDocumentXmlLocation()
{
    using (ZipFile zip = new ZipFile(docxFile))
    {
        foreach (ZipEntry entry in zip)
        {
            // Find "[Content_Types].xml" zip entry. Entry names are identifiers,
            // not linguistic text, so compare them ordinally.
            if (!string.Equals(entry.FileName, "[Content_Types].xml",
                    StringComparison.OrdinalIgnoreCase))
            {
                continue;
            }

            XmlDocument xmlDoc = new XmlDocument();
            // The package may come from an untrusted source: never resolve
            // external DTDs or entities while loading it.
            xmlDoc.XmlResolver = null;
            xmlDoc.PreserveWhitespace = true;
            using (var stream = new MemoryStream())
            {
                entry.Extract(stream);
                stream.Position = 0;
                xmlDoc.Load(stream);
            }

            // Create an XmlNamespaceManager for resolving namespaces
            XmlNamespaceManager nsmgr = new XmlNamespaceManager(xmlDoc.NameTable);
            nsmgr.AddNamespace("t", ContentTypeNamespace);

            // Find location of "document.xml"
            XmlNode node = xmlDoc.DocumentElement.SelectSingleNode(
                DocumentXmlXPath, nsmgr);
            if (node == null)
            {
                // Only the first content-types entry is consulted.
                return null;
            }

            string location = ((XmlElement)node).GetAttribute("PartName");
            return location.TrimStart('/');
        }
    }

    return null;
}
#endregion
#region ReadDocumentXml()
/// <summary>
/// Reads "document.xml" zip entry (the entry named by docxFileLocation).
/// </summary>
/// <returns>
/// Text contained in the document body; an empty string when the entry or
/// the body element cannot be found.
/// </returns>
private string ReadDocumentXml()
{
    using (ZipFile zip = new ZipFile(docxFile))
    {
        foreach (ZipEntry entry in zip)
        {
            // Entry names are identifiers, not linguistic text, so compare
            // them ordinally.
            if (!string.Equals(entry.FileName, docxFileLocation,
                    StringComparison.OrdinalIgnoreCase))
            {
                continue;
            }

            XmlDocument xmlDoc = new XmlDocument();
            // The package may come from an untrusted source: never resolve
            // external DTDs or entities while loading it.
            xmlDoc.XmlResolver = null;
            // Whitespace inside text runs is significant, keep it.
            xmlDoc.PreserveWhitespace = true;
            using (var stream = new MemoryStream())
            {
                entry.Extract(stream);
                stream.Position = 0;
                xmlDoc.Load(stream);
            }

            XmlNamespaceManager nsmgr = new XmlNamespaceManager(xmlDoc.NameTable);
            nsmgr.AddNamespace("w", WordprocessingMlNamespace);

            XmlNode body = xmlDoc.DocumentElement.SelectSingleNode(BodyXPath, nsmgr);
            if (body == null)
            {
                return string.Empty;
            }

            // Only the first matching entry is read.
            return ReadNode(body);
        }
    }

    return string.Empty;
}
#endregion
#region ReadNode()
/// <summary>
/// Reads content of the node and its nested children.
/// </summary>
/// <param name="node">XmlNode.</param>
/// <returns>
/// Text contained in the node; an empty string when the node is null or is
/// not an element.
/// </returns>
private string ReadNode(XmlNode node)
{
    if (node == null || node.NodeType != XmlNodeType.Element)
    {
        return string.Empty;
    }

    StringBuilder sb = new StringBuilder();
    AppendNodeText(node, sb);
    return sb.ToString();
}

/// <summary>
/// Appends the text of all element descendants of the node to the builder.
/// A single builder is shared by the whole recursion so nested content is
/// not copied once per nesting level.
/// </summary>
/// <param name="node">Element whose children are read.</param>
/// <param name="sb">Builder receiving the text.</param>
private static void AppendNodeText(XmlNode node, StringBuilder sb)
{
    foreach (XmlNode child in node.ChildNodes)
    {
        if (child.NodeType != XmlNodeType.Element)
        {
            continue;
        }

        switch (child.LocalName)
        {
            case "t": // Text
                string text = child.InnerText;
                string space = ((XmlElement)child).GetAttribute("xml:space");
                // With xml:space="preserve" the run's whitespace is part of the
                // text, so it is copied verbatim. Trimming it and appending a
                // space instead would insert a blank between the fragments of
                // a word that is split over several runs.
                sb.Append(space == "preserve" ? text : text.TrimEnd());
                break;
            case "cr": // Carriage return
            case "br": // Break
                sb.Append(Environment.NewLine);
                break;
            case "tab": // Tab
                sb.Append('\t');
                break;
            case "p": // Paragraph
                AppendNodeText(child, sb);
                sb.Append(Environment.NewLine);
                sb.Append(Environment.NewLine);
                break;
            default:
                AppendNodeText(child, sb);
                break;
        }
    }
}
#endregion
}
这是这段代码的用法:
DocxToText dtt = new DocxToText(filepath);
string docxText = dtt.ExtractText();
发布于 2011-10-28 01:21:11
将光标放在单词的中间并保存文档将导致单词在两个XML标记之间被分割,其中有一个“_GoBack”书签。结果是,在使用此例程进行解析后,将在这两个字符串片段之间放置一个空格,而不是将它们合并回一个字符串。很容易处理“_GoBack”场景,但可能还有其他场景。也许是“跟踪变化”,谁知道还会发生什么。
对于DOCX是否存在更详细的解析算法?
https://stackoverflow.com/questions/1939187
复制相似问题