summaryrefslogtreecommitdiffstats
path: root/external/pdfium
diff options
context:
space:
mode:
authorAshod Nakashian <ashod.nakashian@collabora.co.uk>2018-04-08 23:38:55 -0400
committerJan Holesovsky <kendy@collabora.com>2018-06-07 10:45:21 +0200
commita262ca762a0646c68faa9fcebc0f8e898620a574 (patch)
treec1e18eab44f415c3f81a6bb27f8ffa3db67477a0 /external/pdfium
parentsvx: import PDF text using PDFium (diff)
downloadcore-a262ca762a0646c68faa9fcebc0f8e898620a574.tar.gz
core-a262ca762a0646c68faa9fcebc0f8e898620a574.zip
svx: more accurate PDF text importing
Change-Id: If37119510cbc091dc86cb5f699984186167745c7 (cherry picked from commit 7dc905d1e9b561bb71f58881190bb6f590d09d80)
Diffstat (limited to 'external/pdfium')
-rw-r--r--external/pdfium/0002-svx-more-accurate-PDF-text-importing.patch.2173
-rw-r--r--external/pdfium/UnpackedTarball_pdfium.mk1
2 files changed, 174 insertions, 0 deletions
diff --git a/external/pdfium/0002-svx-more-accurate-PDF-text-importing.patch.2 b/external/pdfium/0002-svx-more-accurate-PDF-text-importing.patch.2
new file mode 100644
index 000000000000..ab5564a87353
--- /dev/null
+++ b/external/pdfium/0002-svx-more-accurate-PDF-text-importing.patch.2
@@ -0,0 +1,173 @@
+From 5f83d0a3fac4f8ccef457c03b74433ffd7b12e2a Mon Sep 17 00:00:00 2001
+From: Ashod Nakashian <ashod.nakashian@collabora.co.uk>
+Date: Tue, 5 Jun 2018 11:28:30 +0200
+Subject: [PATCH 02/14] svx: more accurate PDF text importing
+
+---
+ pdfium/fpdfsdk/fpdf_editpage.cpp | 84 ++++++++++++++++++++++++++++++++++++++++
+ pdfium/public/fpdf_edit.h | 36 +++++++++++++++++
+ 2 files changed, 120 insertions(+)
+
+diff --git a/pdfium/fpdfsdk/fpdf_editpage.cpp b/pdfium/fpdfsdk/fpdf_editpage.cpp
+index 912df63..3244943 100644
+--- a/pdfium/fpdfsdk/fpdf_editpage.cpp
++++ b/pdfium/fpdfsdk/fpdf_editpage.cpp
+@@ -12,6 +12,7 @@
+ #include <vector>
+
+ #include "core/fpdfapi/edit/cpdf_pagecontentgenerator.h"
++#include "core/fpdfapi/font/cpdf_font.h"
+ #include "core/fpdfapi/page/cpdf_form.h"
+ #include "core/fpdfapi/page/cpdf_formobject.h"
+ #include "core/fpdfapi/page/cpdf_imageobject.h"
+@@ -626,6 +627,26 @@ FPDFPageObj_SetLineCap(FPDF_PAGEOBJECT page_object, int line_cap) {
+ return true;
+ }
+
++// Returns the character count of |text_object|, or 0 for a null handle.
++FPDF_EXPORT int FPDF_CALLCONV
++FPDFTextObj_CountChars(FPDF_PAGEOBJECT text_object) {
++  if (!text_object)
++    return 0;
++
++  auto* text = static_cast<CPDF_TextObject*>(text_object);
++  return text->CountChars();
++}
++
++// Returns the font size of |text_object| as an int, or 0 for a null handle.
++FPDF_EXPORT int FPDF_CALLCONV
++FPDFTextObj_GetFontSize(FPDF_PAGEOBJECT text_object) {
++  if (!text_object)
++    return 0;
++  // NOTE(review): the int return type may drop a fractional size — confirm.
++  auto* text = static_cast<CPDF_TextObject*>(text_object);
++  return text->GetFontSize();
++}
++
+ FPDF_EXPORT void FPDF_CALLCONV
+ FPDFTextObj_GetMatrix(FPDF_PAGEOBJECT text_object,
+ double* a,
+@@ -642,3 +663,66 @@ FPDFTextObj_GetMatrix(FPDF_PAGEOBJECT text_object,
+ *c = matrix.c;
+ *d = matrix.d;
+ }
++
++// Returns the char code of the character at |index|, or 0 if out of range.
++FPDF_EXPORT int FPDF_CALLCONV
++FPDFTextObj_GetUnicode(FPDF_PAGEOBJECT text_object, int index) {
++  if (!text_object || index < 0)
++    return 0;
++
++  CPDF_TextObject* pTxtObj = static_cast<CPDF_TextObject*>(text_object);
++  if (index >= pTxtObj->CountChars())  // Valid indices are [0, CountChars()).
++    return 0;
++
++  CPDF_TextObjectItem info;
++  pTxtObj->GetCharInfo(index, &info);
++  return info.m_CharCode;
++}
++
++// Writes UTF-16LE text for up to |char_count| characters starting at
++// |char_start| into |result|; returns the units written, including the NUL.
++FPDF_EXPORT int FPDF_CALLCONV FPDFTextObj_GetText(FPDF_PAGEOBJECT text_object,
++                                                  int char_start,
++                                                  int char_count,
++                                                  unsigned short* result) {
++  if (!text_object || char_start < 0 || char_count < 0 || !result)
++    return 0;
++
++  CPDF_TextObject* pTxtObj = static_cast<CPDF_TextObject*>(text_object);
++  int char_available = pTxtObj->CountChars() - char_start;
++  if (char_available <= 0)
++    return 0;
++
++  char_count = std::min(char_count, char_available);
++  if (char_count == 0) {
++    // Writing out "", which has a character count of 1 due to the NUL.
++    *result = '\0';
++    return 1;
++  }
++
++  // Only the requested window is converted; |result| is sized for that.
++  CPDF_Font* pFont = pTxtObj->GetFont();
++  WideString str;
++  int char_index = 0;
++  for (uint32_t charcode : pTxtObj->GetCharCodes()) {
++    if (charcode == CPDF_Font::kInvalidCharCode)
++      continue;
++    if (char_index >= char_start + char_count)
++      break;
++    if (char_index++ >= char_start)
++      str += pFont->UnicodeFromCharCode(charcode);
++  }
++
++  // Guard against a char code mapping to more than one unit of output.
++  if (str.GetLength() > static_cast<size_t>(char_count))
++    str = str.Left(static_cast<size_t>(char_count));
++
++  // UTF16LE_Encode doesn't handle surrogate pairs properly, so it is expected
++  // the number of items to stay the same.
++  ByteString byte_str = str.UTF16LE_Encode();
++  size_t byte_str_len = byte_str.GetLength();
++  int ret_count = byte_str_len / sizeof(unsigned short);
++  ASSERT(ret_count <= char_count + 1);  // +1 to account for the NUL terminator.
++  memcpy(result, byte_str.GetBuffer(byte_str_len), byte_str_len);
++  return ret_count;
++}
+diff --git a/pdfium/public/fpdf_edit.h b/pdfium/public/fpdf_edit.h
+index 3f45495..602849f 100644
+--- a/pdfium/public/fpdf_edit.h
++++ b/pdfium/public/fpdf_edit.h
+@@ -971,6 +971,26 @@ FPDFPageObj_CreateTextObj(FPDF_DOCUMENT document,
+ FPDF_FONT font,
+ float font_size);
+
++// Get the number of characters from a text object.
++//
++// text_object - Handle of text object returned by FPDFPageObj_NewTextObj
++// or FPDFPageObj_NewTextObjEx.
++// Return Value:
++// A character count in the text object.
++FPDF_EXPORT int FPDF_CALLCONV
++FPDFTextObj_CountChars(FPDF_PAGEOBJECT text_object);
++
++
++// Get the font size of a text object.
++//
++// text_object - Handle of text object returned by FPDFPageObj_NewTextObj
++// or FPDFPageObj_NewTextObjEx.
++//
++// Return Value:
++// The value of the font size
++FPDF_EXPORT int FPDF_CALLCONV
++FPDFTextObj_GetFontSize(FPDF_PAGEOBJECT text_object);
++
+ // Get the matrix of a particular text object.
+ //
+ // text_object - Handle of text object returned by FPDFPageObj_NewTextObj
+@@ -986,6 +1006,22 @@ FPDFTextObj_GetMatrix(FPDF_PAGEOBJECT text_object,
+ double* c,
+ double* d);
+
++// Get the unicode of a specific character in a text object.
++//
++// text_object - Handle of text object returned by FPDFPageObj_NewTextObj
++// or FPDFPageObj_NewTextObjEx.
++// index - The index of the character to get the unicode.
++// Return Value:
++// The unicode value.
++FPDF_EXPORT int FPDF_CALLCONV
++FPDFTextObj_GetUnicode(FPDF_PAGEOBJECT text_object, int index);
++
++FPDF_EXPORT int FPDF_CALLCONV
++FPDFTextObj_GetText(FPDF_PAGEOBJECT text_object,
++ int char_start,
++ int char_count,
++ unsigned short* result);
++
+ #ifdef __cplusplus
+ } // extern "C"
+ #endif // __cplusplus
+--
+2.16.3
+
diff --git a/external/pdfium/UnpackedTarball_pdfium.mk b/external/pdfium/UnpackedTarball_pdfium.mk
index 58c014f41252..22e762695300 100644
--- a/external/pdfium/UnpackedTarball_pdfium.mk
+++ b/external/pdfium/UnpackedTarball_pdfium.mk
@@ -15,6 +15,7 @@ pdfium_patches += icu.patch.1
pdfium_patches += build.patch.1
# Adds missing editing API
pdfium_patches += 0001-svx-import-PDF-text-using-PDFium.patch.2
+pdfium_patches += 0002-svx-more-accurate-PDF-text-importing.patch.2
$(eval $(call gb_UnpackedTarball_UnpackedTarball,pdfium))