From 7e8ecec81f102993e3fe73256415dcf049c09e29 Mon Sep 17 00:00:00 2001
From: Ashod Nakashian
Date: Tue, 5 Jun 2018 11:35:39 +0200
Subject: [PATCH 12/14] svx: import processed PDF text

---
NOTE(review): this patch was re-wrapped from a whitespace-collapsed copy. The
leading whitespace of context lines and the post-image blob ids on the "index"
lines were not recoverable and should be confirmed against the pdfium tree
before applying.

 pdfium/core/fpdftext/cpdf_textpage.cpp | 27 ++++++++++++++++++++++
 pdfium/core/fpdftext/cpdf_textpage.h   |  2 ++
 pdfium/fpdfsdk/fpdf_editpage.cpp       | 44 ++++++++++++++++++++++++++++++++++
 pdfium/public/fpdf_edit.h              | 22 +++++++++++++++++
 4 files changed, 95 insertions(+)

diff --git a/pdfium/core/fpdftext/cpdf_textpage.cpp b/pdfium/core/fpdftext/cpdf_textpage.cpp
index 5690698..4d7c48a 100644
--- a/pdfium/core/fpdftext/cpdf_textpage.cpp
+++ b/pdfium/core/fpdftext/cpdf_textpage.cpp
@@ -1464,3 +1464,30 @@ Optional<PAGECHAR_INFO> CPDF_TextPage::GenerateCharInfo(wchar_t unicode) {
                                  info.m_Origin.x, info.m_Origin.y);
   return info;
 }
+
+// Returns the processed text of |pTextObj| as recorded in |m_CharList|.
+// Entries owned by |pTextObj| are appended in list order (entries whose
+// |m_Unicode| is 0 are skipped). A space owned by any other entry is appended
+// only when it directly follows one of |pTextObj|'s entries. Returns an empty
+// string if the page has not been parsed.
+WideString CPDF_TextPage::GetTextObjectText(CPDF_TextObject* pTextObj) {
+  if (!m_bIsParsed)
+    return WideString();
+
+  // True while the previously visited entry belonged to |pTextObj|.
+  bool bPrevCharInObject = false;
+  WideString strText;
+  for (const auto& charinfo : m_CharList) {
+    if (charinfo.m_pTextObj == pTextObj) {
+      bPrevCharInObject = true;
+      if (charinfo.m_Unicode)
+        strText += charinfo.m_Unicode;
+    } else {
+      // Keep one separating space that trails the object's text.
+      if (bPrevCharInObject && charinfo.m_Unicode == L' ')
+        strText += charinfo.m_Unicode;
+      bPrevCharInObject = false;
+    }
+  }
+  return strText;
+}
diff --git a/pdfium/core/fpdftext/cpdf_textpage.h b/pdfium/core/fpdftext/cpdf_textpage.h
index 43a0312..7d5d5ec 100644
--- a/pdfium/core/fpdftext/cpdf_textpage.h
+++ b/pdfium/core/fpdftext/cpdf_textpage.h
@@ -105,6 +105,8 @@ class CPDF_TextPage {
   WideString GetPageText(int start, int count) const;
   WideString GetAllPageText() const { return GetPageText(0, CountChars()); }
 
+  WideString GetTextObjectText(CPDF_TextObject* pTextObj);
+
   int CountRects(int start, int nCount);
   bool GetRect(int rectIndex, CFX_FloatRect* pRect) const;
 
diff --git a/pdfium/fpdfsdk/fpdf_editpage.cpp b/pdfium/fpdfsdk/fpdf_editpage.cpp
index f4a1688..f34d3b5 100644
--- a/pdfium/fpdfsdk/fpdf_editpage.cpp
+++ b/pdfium/fpdfsdk/fpdf_editpage.cpp
@@ -26,6 +26,7 @@
 #include "core/fpdfapi/parser/cpdf_string.h"
 #include "core/fpdfdoc/cpdf_annot.h"
 #include "core/fpdfdoc/cpdf_annotlist.h"
+#include "core/fpdftext/cpdf_textpage.h"
 #include "fpdfsdk/cpdfsdk_helpers.h"
 #include "public/fpdf_formfill.h"
 #include "third_party/base/logging.h"
@@ -656,6 +657,49 @@ FPDFPageObj_SetLineCap(FPDF_PAGEOBJECT page_object, int line_cap) {
   return true;
 }
 
+FPDF_EXPORT int FPDF_CALLCONV
+FPDFTextObj_GetTextProcessed(FPDF_PAGEOBJECT text_object,
+                             FPDF_TEXTPAGE page,
+                             int char_start,
+                             int char_count,
+                             unsigned short* result) {
+  if (!page || !text_object || char_start < 0 || char_count < 0 || !result)
+    return 0;
+
+  // Bail out if the handle does not resolve to a text object.
+  CPDF_TextObject* pTxtObj = CPDFTextObjectFromFPDFPageObject(text_object);
+  if (!pTxtObj)
+    return 0;
+
+  CPDF_TextPage* textpage = CPDFTextPageFromFPDFTextPage(page);
+  int char_available = textpage->CountChars() - char_start;
+  if (char_available <= 0)
+    return 0;
+
+  char_count = std::min(char_count, char_available);
+  if (char_count == 0) {
+    // Writing out "", which has a character count of 1 due to the NUL.
+    *result = '\0';
+    return 1;
+  }
+
+  WideString str = textpage->GetTextObjectText(pTxtObj);
+
+  if (str.GetLength() > static_cast<size_t>(char_count))
+    str = str.Left(static_cast<size_t>(char_count));
+
+  // UTF16LE_Encode doesn't handle surrogate pairs properly, so it is expected
+  // the number of items to stay the same.
+  ByteString byte_str = str.UTF16LE_Encode();
+  size_t byte_str_len = byte_str.GetLength();
+  constexpr size_t kBytesPerCharacter = sizeof(unsigned short);
+  int ret_count = static_cast<int>(byte_str_len / kBytesPerCharacter);
+
+  ASSERT(ret_count <= char_count + 1);  // +1 to account for the NUL terminator.
+  memcpy(result, byte_str.GetBuffer(byte_str_len), byte_str_len);
+  return ret_count;
+}
+
 FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV
 FPDFTextObj_GetColor(FPDF_PAGEOBJECT text_object,
                      unsigned int* R,
diff --git a/pdfium/public/fpdf_edit.h b/pdfium/public/fpdf_edit.h
index f249e64..e14b2a5 100644
--- a/pdfium/public/fpdf_edit.h
+++ b/pdfium/public/fpdf_edit.h
@@ -1088,6 +1088,28 @@ FPDFTextObj_CountChars(FPDF_PAGEOBJECT text_object);
 FPDF_EXPORT int FPDF_CALLCONV
 FPDFTextObj_GetFontSize(FPDF_PAGEOBJECT text_object);
 
+// Get the processed text of a text object.
+//
+// text_object - Handle of text object returned by FPDFPageObj_NewTextObj
+//               or FPDFPageObj_NewTextObjEx.
+// page        - Handle to the text page whose characters are searched for
+//               |text_object|.
+// char_start  - Must be >= 0 and less than the character count of |page|.
+//               It only limits |char_count|; it is not an offset into the
+//               text of |text_object|.
+// char_count  - Maximum number of characters to retrieve.
+// result      - Buffer receiving the text as UTF-16LE. It must be able to
+//               hold |char_count| + 1 items (text plus NUL terminator).
+// Return Value:
+//   The number of characters (not bytes) written in result, including the
+//   NUL terminator, or 0 on failure.
+FPDF_EXPORT int FPDF_CALLCONV
+FPDFTextObj_GetTextProcessed(FPDF_PAGEOBJECT text_object,
+                             FPDF_TEXTPAGE page,
+                             int char_start,
+                             int char_count,
+                             unsigned short* result);
+
 // Get the stroke RGBA of a text. Range of values: 0 - 255.
 //
 // path   - the handle to the path object.
-- 
2.16.3