文章/答案/技术大牛

发布

社区首页 >问答首页 >docx解析器错误的word ifilter

问docx解析器错误的word ifilter
EN

Stack Overflow用户

提问于 2009-12-21 09:51:04

回答 2查看 2.8K关注 0票数 4

.Docx文档似乎没有被索引。

我在 .docx 文档中放入了一个唯一的字符串，但搜索 "one" 时并没有返回这个 .docx 文档。

例如，下面这段文本：

“这是第一行的文本，第二行的文本。”

将通过iFilter提取如下：

“这是1行的文本，这里是第2行的文本。”

因此，当 IFilter 解析 .docx 时，它会删除换行分隔符，并尝试解析“and and here”。

因此，用于 .docx 的 Word IFilter 似乎会把一行的最后一个单词与下一行的第一个单词连在一起。

有人能给出一些如何绕过这个问题的想法吗？

提前谢谢。

ifilter

回答 2

Stack Overflow用户

发布于 2011-08-24 15:46:34

好了，我想出来了。基本上，64位的IFilter不能正常工作：它会把被换行符分隔的单词合并在一起，而不保留其间的换行。我改用Ionic.Zip访问docx压缩包，并用稍加修改的DocxToText版本解析其中关键的xml文件。现在这个方案运行得很好。

以下是在Jevgenij Pankov最初编写的代码基础上修改后的版本：

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using Ionic.Zip;
using System.IO;
using System.Xml;

/// <summary>
/// Extracts the plain text of a .docx file by opening the package as a ZIP
/// archive (via Ionic.Zip), locating the main document part through
/// "[Content_Types].xml", and walking the WordprocessingML body.
/// </summary>
public class DocxToText
{
    private const string ContentTypeNamespace =
        @"http://schemas.openxmlformats.org/package/2006/content-types";

    private const string WordprocessingMlNamespace =
        @"http://schemas.openxmlformats.org/wordprocessingml/2006/main";

    // Selects the <Override> entry that declares the main document part.
    private const string DocumentXmlXPath =
        "/t:Types/t:Override[@ContentType=\"" +
        "application/vnd.openxmlformats-officedocument." +
        "wordprocessingml.document.main+xml\"]";

    private const string BodyXPath = "/w:document/w:body";

    // Path of the .docx file given to the constructor.
    private readonly string docxFile = "";

    // Zip entry name of the main document part; set by ExtractText().
    private string docxFileLocation = "";

    /// <summary>
    /// Creates an extractor for the given .docx file. The file is not opened
    /// until <see cref="ExtractText"/> is called.
    /// </summary>
    /// <param name="fileName">Path of the .docx file to read.</param>
    public DocxToText(string fileName)
    {
        docxFile = fileName;
    }

    #region ExtractText()
    /// <summary>
    /// Extracts text from the Docx file.
    /// </summary>
    /// <returns>Extracted text.</returns>
    /// <exception cref="InvalidOperationException">
    /// No input file name was supplied to the constructor.
    /// </exception>
    /// <exception cref="InvalidDataException">
    /// The package does not declare a main document part.
    /// </exception>
    public string ExtractText()
    {
        // Both exceptions below derive from Exception, so callers that catch
        // Exception keep working; the messages are unchanged.
        if (string.IsNullOrEmpty(docxFile))
        {
            throw new InvalidOperationException("Input file not specified.");
        }

        // Usually it is "/word/document.xml"
        docxFileLocation = FindDocumentXmlLocation();

        if (string.IsNullOrEmpty(docxFileLocation))
        {
            throw new InvalidDataException("It is not a valid Docx file.");
        }

        return ReadDocumentXml();
    }
    #endregion

    #region FindDocumentXmlLocation()
    /// <summary>
    /// Gets location of the "document.xml" zip entry.
    /// </summary>
    /// <returns>
    /// Location of the "document.xml" without its leading '/', or
    /// <c>null</c> when "[Content_Types].xml" is missing or does not declare
    /// a main document part.
    /// </returns>
    private string FindDocumentXmlLocation()
    {
        using (ZipFile zip = new ZipFile(docxFile))
        {
            foreach (ZipEntry entry in zip)
            {
                // Find "[Content_Types].xml" zip entry. Part names are
                // identifiers, so compare ordinally rather than by culture.
                if (!string.Equals(entry.FileName, "[Content_Types].xml",
                                   StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                XmlDocument xmlDoc = LoadEntryXml(entry);

                // Create an XmlNamespaceManager for resolving namespaces
                XmlNamespaceManager nsmgr =
                    new XmlNamespaceManager(xmlDoc.NameTable);
                nsmgr.AddNamespace("t", ContentTypeNamespace);

                // Find location of "document.xml"
                XmlNode node = xmlDoc.DocumentElement.SelectSingleNode(
                    DocumentXmlXPath, nsmgr);

                if (node == null)
                {
                    // Only the first matching entry is inspected.
                    return null;
                }

                string location = ((XmlElement)node).GetAttribute("PartName");
                return location.TrimStart(new char[] { '/' });
            }
        }
        return null;
    }
    #endregion

    #region ReadDocumentXml()
    /// <summary>
    /// Reads "document.xml" zip entry.
    /// </summary>
    /// <returns>
    /// Text contained in the document body, or an empty string when the
    /// entry or its w:body element cannot be found.
    /// </returns>
    private string ReadDocumentXml()
    {
        using (ZipFile zip = new ZipFile(docxFile))
        {
            foreach (ZipEntry entry in zip)
            {
                if (!string.Equals(entry.FileName, docxFileLocation,
                                   StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                XmlDocument xmlDoc = LoadEntryXml(entry);

                XmlNamespaceManager nsmgr =
                    new XmlNamespaceManager(xmlDoc.NameTable);
                nsmgr.AddNamespace("w", WordprocessingMlNamespace);

                XmlNode body =
                    xmlDoc.DocumentElement.SelectSingleNode(BodyXPath, nsmgr);

                // ReadNode returns string.Empty for a null node.
                return ReadNode(body);
            }
        }
        return string.Empty;
    }
    #endregion

    #region LoadEntryXml()
    /// <summary>
    /// Extracts a zip entry into memory and parses it as XML, keeping
    /// whitespace nodes so that text content is not altered by the parser.
    /// </summary>
    /// <param name="entry">Zip entry containing an XML part.</param>
    /// <returns>The parsed XML document.</returns>
    private static XmlDocument LoadEntryXml(ZipEntry entry)
    {
        XmlDocument xmlDoc = new XmlDocument();
        xmlDoc.PreserveWhitespace = true;

        using (MemoryStream stream = new MemoryStream())
        {
            entry.Extract(stream);
            stream.Position = 0;    // rewind before parsing
            xmlDoc.Load(stream);
        }
        return xmlDoc;
    }
    #endregion

    #region ReadNode()
    /// <summary>
    /// Reads content of the node and its nested children.
    /// </summary>
    /// <param name="node">XmlNode.</param>
    /// <returns>
    /// Text contained in the node; empty when <paramref name="node"/> is
    /// null or is not an element.
    /// </returns>
    private string ReadNode(XmlNode node)
    {
        StringBuilder sb = new StringBuilder();
        AppendNodeText(node, sb);
        return sb.ToString();
    }

    /// <summary>
    /// Appends the text of <paramref name="node"/> and its descendants to
    /// <paramref name="sb"/>. A single builder is shared across the whole
    /// recursion so nested content is not copied once per nesting level.
    /// </summary>
    /// <param name="node">Element to read; other node types are ignored.</param>
    /// <param name="sb">Builder that receives the extracted text.</param>
    private static void AppendNodeText(XmlNode node, StringBuilder sb)
    {
        if (node == null || node.NodeType != XmlNodeType.Element)
        {
            return;
        }

        foreach (XmlNode child in node.ChildNodes)
        {
            if (child.NodeType != XmlNodeType.Element)
            {
                continue;
            }

            switch (child.LocalName)
            {
                case "t":                           // Text
                    string space =
                        ((XmlElement)child).GetAttribute("xml:space");
                    if (space == "preserve")
                    {
                        // Whitespace is significant here: copy the run
                        // verbatim. Trimming and then adding one space put a
                        // stray space after runs that only had a leading
                        // space, splitting words stored across several runs.
                        sb.Append(child.InnerText);
                    }
                    else
                    {
                        sb.Append(child.InnerText.TrimEnd());
                    }
                    break;

                case "cr":                          // Carriage return
                case "br":                          // Break
                    sb.Append(Environment.NewLine);
                    break;

                case "tab":                         // Tab
                    sb.Append("\t");
                    break;

                case "p":                           // Paragraph
                    AppendNodeText(child, sb);
                    // Blank line between paragraphs.
                    sb.Append(Environment.NewLine);
                    sb.Append(Environment.NewLine);
                    break;

                default:
                    AppendNodeText(child, sb);
                    break;
            }
        }
    }
    #endregion
}

这段代码的用法如下：

// Create an extractor for the .docx file at "filepath" (supplied by the
// caller), then read its plain text.
var extractor = new DocxToText(filepath);
string docxText = extractor.ExtractText();

票数 2

Stack Overflow用户

发布于 2011-10-28 01:21:11

把光标放在某个单词中间并保存文档，会导致该单词被拆分到两个XML标记中，二者之间夹着一个“_GoBack”书签。结果是，用这段例程解析之后，这两个字符串片段之间会被插入一个空格，而不是重新合并成一个字符串。“_GoBack”这种情况很容易处理，但可能还存在其他类似情况，比如“跟踪更改（修订）”，谁知道还会有什么。

对于DOCX是否存在更详细的解析算法？

票数 1

页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持

原文链接：

https://stackoverflow.com/questions/1939187

复制

相似问题

问docx解析器错误的word ifilter
EN

回答 2

Stack Overflow用户

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问docx解析器错误的word ifilterEN

回答 2

Stack Overflow用户

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问docx解析器错误的word ifilter
EN