我尝试使用xpdf源代码到MFC应用程序中,将pdf转换成文本。代码示例取自它们的站点(或存储库):
int Pdf2Txt(std::string PdfFile, std::string TxtFile) const
{
GString* ownerPW, *userPW;
UnicodeMap* uMap;
TextOutputDev* textOut;
TextOutputControl textOutControl;
GString* textFileName;
int exitCode;
char textEncName[128] = "";
char textEOL[16] = "";
GBool noPageBreaks = gFalse;
GBool quiet = gFalse;
char ownerPassword[33] = "\001";
char userPassword[33] = "\001";
int firstPage = 1;
int lastPage = 0;
GBool tableLayout = gFalse;
double fixedPitch = 0;
GBool physLayout = gFalse;
GBool simpleLayout = gFalse;
GBool simple2Layout = gFalse;
GBool linePrinter = gFalse;
GBool rawOrder = gFalse;
double fixedLineSpacing = 0;
double marginLeft = 0;
double marginRight = 0;
double marginTop = 0;
double marginBottom = 0;
GBool clipText = gFalse;
GBool discardDiag = gFalse;
GBool insertBOM = gFalse;
exitCode = 99;
// read config file
globalParams = new GlobalParams("");
if (textEncName[0])
{
globalParams->setTextEncoding(textEncName);
}
if (textEOL[0])
{
if (!globalParams->setTextEOL(textEOL))
{
fprintf(stderr, "Bad '-eol' value on command line\n");
}
}
if (noPageBreaks)
{
globalParams->setTextPageBreaks(gFalse);
}
if (quiet)
{
globalParams->setErrQuiet(quiet);
}
// Set UNICODE support
globalParams->setTextEncoding("UTF-8");
// get mapping to output encoding
if (!(uMap = globalParams->getTextEncoding()))
{
error(errConfig, -1, "Couldn't get text encoding");
goto err1;
}
// open PDF file
if (ownerPassword[0] != '\001')
{
ownerPW = new GString(ownerPassword);
}
else
{
ownerPW = NULL;
}
if (userPassword[0] != '\001')
{
userPW = new GString(userPassword);
}
else
{
userPW = NULL;
}
PDFDoc* doc = new PDFDoc((char*)PdfFile.c_str(), ownerPW, userPW);
if (userPW)
{
delete userPW;
}
if (ownerPW)
{
delete ownerPW;
}
if (! doc->isOk())
{
exitCode = 1;
goto err2;
}
// check for copy permission
if (! doc->okToCopy())
{
error(errNotAllowed, -1, "Copying of text from this document is not allowed.");
exitCode = 3;
goto err2;
}
// construct text file name
textFileName = new GString(TxtFile.c_str());
// get page range
if (firstPage < 1)
{
firstPage = 1;
}
if (lastPage < 1 || lastPage > doc->getNumPages())
{
lastPage = doc->getNumPages();
}
// write text file
if (tableLayout)
{
textOutControl.mode = textOutTableLayout;
textOutControl.fixedPitch = fixedPitch;
}
else if (physLayout)
{
textOutControl.mode = textOutPhysLayout;
textOutControl.fixedPitch = fixedPitch;
}
else if (simpleLayout)
{
textOutControl.mode = textOutSimpleLayout;
}
else if (simple2Layout)
{
textOutControl.mode = textOutSimple2Layout;
}
else if (linePrinter)
{
textOutControl.mode = textOutLinePrinter;
textOutControl.fixedPitch = fixedPitch;
textOutControl.fixedLineSpacing = fixedLineSpacing;
}
else if (rawOrder)
{
textOutControl.mode = textOutRawOrder;
}
else
{
textOutControl.mode = textOutReadingOrder;
}
textOutControl.clipText = clipText;
textOutControl.discardDiagonalText = discardDiag;
textOutControl.insertBOM = insertBOM;
textOutControl.marginLeft = marginLeft;
textOutControl.marginRight = marginRight;
textOutControl.marginTop = marginTop;
textOutControl.marginBottom = marginBottom;
textOut = new TextOutputDev(textFileName->getCString(), &textOutControl, gFalse, gTrue);
if (textOut->isOk())
{
doc->displayPages(textOut, firstPage, lastPage, 72, 72, 0, gFalse, gTrue, gFalse);
}
else
{
delete textOut;
exitCode = 2;
goto err3;
}
delete textOut;
exitCode = 0;
// clean up
err3:
delete textFileName;
err2:
delete doc;
// uMap->decRefCnt();
err1:
delete globalParams;
// check for memory leaks
Object::memCheck(stderr);
gMemReport(stderr);
return exitCode;
}到现在为止还好。但是这段代码并不是线程安全的:如果我尝试在多线程代码中运行这段代码,它会崩溃:
// TextOutputDev.cc
if (uMap->isUnicode())
{
lreLen = uMap->mapUnicode(0x202a, lre, sizeof(lre)); // <-- crash为什么?因为有一个变量globalParams,它在函数的最后一行中被删除,并且它对所有线程都很常见:
delete globalParams;它是GlobalParams.h ( xpdf代码的一部分)中的一个外部全局变量:
// xpdf/GlobalParams.h
// The global parameters object.
extern GlobalParams *globalParams;如何确保此函数线程安全?因为“问题变量”是在xpdf源代码中,而不是我的.
总之,globalParams是用xpdf代码声明的,并在我的(客户端)代码中进行了清理。
xpdf源代码可以在这里看到:https://github.com/jeroen/xpdf/blob/c2c946f517eb09cfd09d957e0f3b04d44bf6f827/src/poppler/GlobalParams.h
和
发布于 2022-09-20 10:59:17
尝试重新构造代码,如下所示。我已经将GlobalParams初始化代码移动到一个单独的函数中。在初始化期间,或者在启动调用Pdf2Txt()的线程之前,应该调用此函数(一次)。当然,不应该销毁GlobalParams实例,因为它可以被多个线程使用。它不会损害应用程序的内存,它只是一个对象,而不是很大--嗯,它包含许多int和bool成员变量,但是这些变量占用的空间不多,也占用了相当多的string*变量(我想最初是null或emtpy ),所以最多只有几KB。
bool InitGlobalParams()
{
UnicodeMap* uMap;
char textEncName[128] = "";
char textEOL[16] = "";
GBool noPageBreaks = gFalse;
GBool quiet = gFalse;
// read config file
globalParams = new GlobalParams(""); // <-- Maybe add some checking code here?
if (textEncName[0])
{
globalParams->setTextEncoding(textEncName);
}
if (textEOL[0])
{
if (!globalParams->setTextEOL(textEOL))
{
fprintf(stderr, "Bad '-eol' value on command line\n");
}
}
if (noPageBreaks)
{
globalParams->setTextPageBreaks(gFalse);
}
if (quiet)
{
globalParams->setErrQuiet(quiet);
}
// Set UNICODE support
globalParams->setTextEncoding("UTF-8");
// get mapping to output encoding
if (!(uMap = globalParams->getTextEncoding()))
{
error(errConfig, -1, "Couldn't get text encoding");
return false;
}
return true;
}
int Pdf2Txt(std::string PdfFile, std::string TxtFile) const
{
GString* ownerPW, *userPW;
TextOutputDev* textOut;
TextOutputControl textOutControl;
GString* textFileName;
int exitCode;
char ownerPassword[33] = "\001";
char userPassword[33] = "\001";
int firstPage = 1;
int lastPage = 0;
GBool tableLayout = gFalse;
double fixedPitch = 0;
GBool physLayout = gFalse;
GBool simpleLayout = gFalse;
GBool simple2Layout = gFalse;
GBool linePrinter = gFalse;
GBool rawOrder = gFalse;
double fixedLineSpacing = 0;
double marginLeft = 0;
double marginRight = 0;
double marginTop = 0;
double marginBottom = 0;
GBool clipText = gFalse;
GBool discardDiag = gFalse;
GBool insertBOM = gFalse;
exitCode = 99;
// open PDF file
if (ownerPassword[0] != '\001')
{
ownerPW = new GString(ownerPassword);
}
else
{
ownerPW = NULL;
}
if (userPassword[0] != '\001')
{
userPW = new GString(userPassword);
}
else
{
userPW = NULL;
}
PDFDoc* doc = new PDFDoc((char*)PdfFile.c_str(), ownerPW, userPW);
if (userPW)
{
delete userPW;
}
if (ownerPW)
{
delete ownerPW;
}
if (! doc->isOk())
{
exitCode = 1;
goto err2;
}
// check for copy permission
if (! doc->okToCopy())
{
error(errNotAllowed, -1, "Copying of text from this document is not allowed.");
exitCode = 3;
goto err2;
}
// construct text file name
textFileName = new GString(TxtFile.c_str());
// get page range
if (firstPage < 1)
{
firstPage = 1;
}
if (lastPage < 1 || lastPage > doc->getNumPages())
{
lastPage = doc->getNumPages();
}
// write text file
if (tableLayout)
{
textOutControl.mode = textOutTableLayout;
textOutControl.fixedPitch = fixedPitch;
}
else if (physLayout)
{
textOutControl.mode = textOutPhysLayout;
textOutControl.fixedPitch = fixedPitch;
}
else if (simpleLayout)
{
textOutControl.mode = textOutSimpleLayout;
}
else if (simple2Layout)
{
textOutControl.mode = textOutSimple2Layout;
}
else if (linePrinter)
{
textOutControl.mode = textOutLinePrinter;
textOutControl.fixedPitch = fixedPitch;
textOutControl.fixedLineSpacing = fixedLineSpacing;
}
else if (rawOrder)
{
textOutControl.mode = textOutRawOrder;
}
else
{
textOutControl.mode = textOutReadingOrder;
}
textOutControl.clipText = clipText;
textOutControl.discardDiagonalText = discardDiag;
textOutControl.insertBOM = insertBOM;
textOutControl.marginLeft = marginLeft;
textOutControl.marginRight = marginRight;
textOutControl.marginTop = marginTop;
textOutControl.marginBottom = marginBottom;
textOut = new TextOutputDev(textFileName->getCString(), &textOutControl, gFalse, gTrue);
if (textOut->isOk())
{
doc->displayPages(textOut, firstPage, lastPage, 72, 72, 0, gFalse, gTrue, gFalse);
}
else
{
delete textOut;
exitCode = 2;
goto err3;
}
delete textOut;
exitCode = 0;
// clean up
err3:
delete textFileName;
err2:
delete doc;
// uMap->decRefCnt();
err1:
// Do NOT delete the one and only GlobalParams instance!!!
//delete globalParams;
// check for memory leaks
Object::memCheck(stderr);
gMemReport(stderr);
return exitCode;
}上面的代码可能甚至不会编译(我用文本编辑器修改了它,并没有真正测试它),所以请做可能需要的任何更改。预期xpdf函数不会修改globalParams对象(对于它们来说是“只读”),因此这段代码有很好的工作机会。顺便说一下,在#if MULTITHREADED类定义(GlobalParams.h)中有一个GlobalParams指令,它的块中包含3个互斥对象。实现代码(GlobalParams.cc)锁定互斥锁以访问GlobalParams成员,因此这可能会导致一些线程稍等片刻,尽管我不知道有多少(必须彻底检查代码,这本身就是一个小“项目”)。你可以试着测试它。
当然,上面@KJ所表达的担忧仍然适用,并行运行许多这样的线程可能会使系统过载(虽然我不确定xpdf是否使用多个线程来处理单个 PDF,请您提供帮助,它是如何配置的?),尤其是如果您在服务器上运行这个线程不允许太多并发转换--运行转换可能会导致其他进程慢下来。它还可能导致I/O瓶颈(磁盘和/或网络),因此最初尝试使用少量线程并检查它是如何扩展的。
https://stackoverflow.com/questions/73772580
复制相似问题