diff options
author | Ashod Nakashian <ashod.nakashian@collabora.co.uk> | 2018-04-08 23:38:55 -0400 |
---|---|---|
committer | Jan Holesovsky <kendy@collabora.com> | 2018-06-07 10:45:21 +0200 |
commit | a262ca762a0646c68faa9fcebc0f8e898620a574 (patch) | |
tree | c1e18eab44f415c3f81a6bb27f8ffa3db67477a0 /external/pdfium | |
parent | svx: import PDF text using PDFium (diff) | |
download | core-a262ca762a0646c68faa9fcebc0f8e898620a574.tar.gz core-a262ca762a0646c68faa9fcebc0f8e898620a574.zip |
svx: more accurate PDF text importing
Change-Id: If37119510cbc091dc86cb5f699984186167745c7
(cherry picked from commit 7dc905d1e9b561bb71f58881190bb6f590d09d80)
Diffstat (limited to 'external/pdfium')
-rw-r--r-- | external/pdfium/0002-svx-more-accurate-PDF-text-importing.patch.2 | 173 | ||||
-rw-r--r-- | external/pdfium/UnpackedTarball_pdfium.mk | 1 |
2 files changed, 174 insertions, 0 deletions
diff --git a/external/pdfium/0002-svx-more-accurate-PDF-text-importing.patch.2 b/external/pdfium/0002-svx-more-accurate-PDF-text-importing.patch.2 new file mode 100644 index 000000000000..ab5564a87353 --- /dev/null +++ b/external/pdfium/0002-svx-more-accurate-PDF-text-importing.patch.2 @@ -0,0 +1,173 @@ +From 5f83d0a3fac4f8ccef457c03b74433ffd7b12e2a Mon Sep 17 00:00:00 2001 +From: Ashod Nakashian <ashod.nakashian@collabora.co.uk> +Date: Tue, 5 Jun 2018 11:28:30 +0200 +Subject: [PATCH 02/14] svx: more accurate PDF text importing + +--- + pdfium/fpdfsdk/fpdf_editpage.cpp | 84 ++++++++++++++++++++++++++++++++++++++++ + pdfium/public/fpdf_edit.h | 36 +++++++++++++++++ + 2 files changed, 120 insertions(+) + +diff --git a/pdfium/fpdfsdk/fpdf_editpage.cpp b/pdfium/fpdfsdk/fpdf_editpage.cpp +index 912df63..3244943 100644 +--- a/pdfium/fpdfsdk/fpdf_editpage.cpp ++++ b/pdfium/fpdfsdk/fpdf_editpage.cpp +@@ -12,6 +12,7 @@ + #include <vector> + + #include "core/fpdfapi/edit/cpdf_pagecontentgenerator.h" ++#include "core/fpdfapi/font/cpdf_font.h" + #include "core/fpdfapi/page/cpdf_form.h" + #include "core/fpdfapi/page/cpdf_formobject.h" + #include "core/fpdfapi/page/cpdf_imageobject.h" +@@ -626,6 +627,26 @@ FPDFPageObj_SetLineCap(FPDF_PAGEOBJECT page_object, int line_cap) { + return true; + } + ++FPDF_EXPORT int FPDF_CALLCONV ++FPDFTextObj_CountChars(FPDF_PAGEOBJECT text_object) ++{ ++ if (!text_object) ++ return 0; ++ ++ CPDF_TextObject* pTxtObj = static_cast<CPDF_TextObject*>(text_object); ++ return pTxtObj->CountChars(); ++} ++ ++FPDF_EXPORT int FPDF_CALLCONV ++FPDFTextObj_GetFontSize(FPDF_PAGEOBJECT text_object) ++{ ++ if (!text_object) ++ return 0; ++ ++ CPDF_TextObject* pTxtObj = static_cast<CPDF_TextObject*>(text_object); ++ return pTxtObj->GetFontSize(); ++} ++ + FPDF_EXPORT void FPDF_CALLCONV + FPDFTextObj_GetMatrix(FPDF_PAGEOBJECT text_object, + double* a, +@@ -642,3 +663,66 @@ FPDFTextObj_GetMatrix(FPDF_PAGEOBJECT text_object, + *c = matrix.c; + *d = matrix.d; + } ++ 
++FPDF_EXPORT int FPDF_CALLCONV ++FPDFTextObj_GetUnicode(FPDF_PAGEOBJECT text_object, int index) ++{ ++ if (!text_object || index < 0) ++ return 0; ++ ++ CPDF_TextObject* pTxtObj = static_cast<CPDF_TextObject*>(text_object); ++ if (index > pTxtObj->CountChars()) ++ return 0; ++ ++ CPDF_TextObjectItem info; ++ pTxtObj->GetCharInfo(index, &info); ++ return info.m_CharCode; ++} ++ ++FPDF_EXPORT int FPDF_CALLCONV FPDFTextObj_GetText(FPDF_PAGEOBJECT text_object, ++ int char_start, ++ int char_count, ++ unsigned short* result) { ++ if (!text_object || char_start < 0 || char_count < 0 || !result) ++ return 0; ++ ++ CPDF_TextObject* pTxtObj = static_cast<CPDF_TextObject*>(text_object); ++ int char_available = pTxtObj->CountChars() - char_start; ++ if (char_available <= 0) ++ return 0; ++ ++ char_count = std::min(char_count, char_available); ++ if (char_count == 0) { ++ // Writing out "", which has a character count of 1 due to the NUL. ++ *result = '\0'; ++ return 1; ++ } ++ ++ CPDF_Font* pFont = pTxtObj->GetFont(); ++ WideString str; ++ for (uint32_t charcode : pTxtObj->GetCharCodes()) { ++ if (charcode != CPDF_Font::kInvalidCharCode) ++ str += pFont->UnicodeFromCharCode(charcode); ++ } ++ ++// CFX_WideTextBuf m_TextBuf; ++// WideString str = textpage->GetPageText(char_start, char_count); ++// return WideString(m_TextBuf.AsStringView().Mid( ++// static_cast<size_t>(text_start), static_cast<size_t>(text_count))); ++ ++// if (str.GetLength() > static_cast<size_t>(char_count)) ++// str = str.Left(static_cast<size_t>(char_count)); ++ ++ // Re-encode in UTF-16. ++// WideString str = text.UTF8Decode(); ++ ++ // UTF16LE_Encode doesn't handle surrogate pairs properly, so it is expected ++ // the number of items to stay the same. ++ ByteString byte_str = str.UTF16LE_Encode(); ++ size_t byte_str_len = byte_str.GetLength(); ++ int ret_count = byte_str_len / sizeof(unsigned short); ++ ++ ASSERT(ret_count <= char_count + 1); // +1 to account for the NUL terminator. 
++ memcpy(result, byte_str.GetBuffer(byte_str_len), byte_str_len); ++ return ret_count; ++} +diff --git a/pdfium/public/fpdf_edit.h b/pdfium/public/fpdf_edit.h +index 3f45495..602849f 100644 +--- a/pdfium/public/fpdf_edit.h ++++ b/pdfium/public/fpdf_edit.h +@@ -971,6 +971,26 @@ FPDFPageObj_CreateTextObj(FPDF_DOCUMENT document, + FPDF_FONT font, + float font_size); + ++// Get the number of characters from a text object. ++// ++// text_object - Handle of text object returned by FPDFPageObj_NewTextObj ++// or FPDFPageObj_NewTextObjEx. ++// Return Value: ++// A character count in the text object. ++FPDF_EXPORT int FPDF_CALLCONV ++FPDFTextObj_CountChars(FPDF_PAGEOBJECT text_object); ++ ++ ++// Get the font size of a text object. ++// ++// text_object - Handle of text object returned by FPDFPageObj_NewTextObj ++// or FPDFPageObj_NewTextObjEx. ++// ++// Return Value: ++// The value of the font size ++FPDF_EXPORT int FPDF_CALLCONV ++FPDFTextObj_GetFontSize(FPDF_PAGEOBJECT text_object); ++ + // Get the matrix of a particular text object. + // + // text_object - Handle of text object returned by FPDFPageObj_NewTextObj +@@ -986,6 +1006,22 @@ FPDFTextObj_GetMatrix(FPDF_PAGEOBJECT text_object, + double* c, + double* d); + ++// Get the unicode of a special character in a text object. ++// ++// text_object - Handle of text object returned by FPDFPageObj_NewTextObj ++// or FPDFPageObj_NewTextObjEx. ++// index - The index of the character to get the unicode. ++// Return Value: ++// The unicode value. 
++FPDF_EXPORT int FPDF_CALLCONV ++FPDFTextObj_GetUnicode(FPDF_PAGEOBJECT text_object, int index); ++ ++FPDF_EXPORT int FPDF_CALLCONV ++FPDFTextObj_GetText(FPDF_PAGEOBJECT text_object, ++ int char_start, ++ int char_count, ++ unsigned short* result); ++ + #ifdef __cplusplus + } // extern "C" + #endif // __cplusplus +-- +2.16.3 + diff --git a/external/pdfium/UnpackedTarball_pdfium.mk b/external/pdfium/UnpackedTarball_pdfium.mk index 58c014f41252..22e762695300 100644 --- a/external/pdfium/UnpackedTarball_pdfium.mk +++ b/external/pdfium/UnpackedTarball_pdfium.mk @@ -15,6 +15,7 @@ pdfium_patches += icu.patch.1 pdfium_patches += build.patch.1 # Adds missing editing API pdfium_patches += 0001-svx-import-PDF-text-using-PDFium.patch.2 +pdfium_patches += 0002-svx-more-accurate-PDF-text-importing.patch.2 $(eval $(call gb_UnpackedTarball_UnpackedTarball,pdfium)) |