summaryrefslogtreecommitdiffstats
path: root/external/pdfium/0012-svx-import-processed-PDF-text.patch.2
blob: 008c7047bdfae216b6f8311ed9f362d47d8455db (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
From 7e8ecec81f102993e3fe73256415dcf049c09e29 Mon Sep 17 00:00:00 2001
From: Ashod Nakashian <ashod.nakashian@collabora.co.uk>
Date: Tue, 5 Jun 2018 11:35:39 +0200
Subject: [PATCH 12/14] svx: import processed PDF text

---
 pdfium/core/fpdftext/cpdf_textpage.cpp | 29 ++++++++++++++++++++++++
 pdfium/core/fpdftext/cpdf_textpage.h   |  2 ++
 pdfium/fpdfsdk/fpdf_editpage.cpp       | 41 ++++++++++++++++++++++++++++++++++
 pdfium/public/fpdf_edit.h              | 13 +++++++++++
 4 files changed, 85 insertions(+)

diff --git a/pdfium/core/fpdftext/cpdf_textpage.cpp b/pdfium/core/fpdftext/cpdf_textpage.cpp
index 5690698..4d7c48a 100644
--- a/pdfium/core/fpdftext/cpdf_textpage.cpp
+++ b/pdfium/core/fpdftext/cpdf_textpage.cpp
@@ -1464,3 +1464,32 @@ Optional<PAGECHAR_INFO> CPDF_TextPage::GenerateCharInfo(wchar_t unicode) {
                                  info.m_Origin.x, info.m_Origin.y);
   return info;
 }
+
+// Returns the processed text of |pTextObj|: every non-NUL character the page
+// attributes to it, plus a space that directly follows such a character but
+// is attributed elsewhere. Returns an empty string if the page is not parsed.
+WideString CPDF_TextPage::GetTextObjectText(CPDF_TextObject* pTextObj)
+{
+  if (!m_bIsParsed)
+    return WideString();
+
+  // True while the previously visited character belonged to |pTextObj|.
+  bool bPrevCharInObject = false;
+  WideString strText;
+  for (const auto& charinfo : m_CharList) {
+    if (charinfo.m_pTextObj == pTextObj) {
+      bPrevCharInObject = true;
+      if (charinfo.m_Unicode)
+        strText += charinfo.m_Unicode;
+    } else if (charinfo.m_Unicode == 32) {
+      // A space outside the object is kept only right after one of its chars.
+      if (bPrevCharInObject) {
+        strText += charinfo.m_Unicode;
+        bPrevCharInObject = false;
+      }
+    } else {
+      bPrevCharInObject = false;
+    }
+  }
+  return strText;
+}
diff --git a/pdfium/core/fpdftext/cpdf_textpage.h b/pdfium/core/fpdftext/cpdf_textpage.h
index 43a0312..7d5d5ec 100644
--- a/pdfium/core/fpdftext/cpdf_textpage.h
+++ b/pdfium/core/fpdftext/cpdf_textpage.h
@@ -105,6 +105,8 @@ class CPDF_TextPage {
   WideString GetPageText(int start, int count) const;
   WideString GetAllPageText() const { return GetPageText(0, CountChars()); }
 
+  WideString GetTextObjectText(CPDF_TextObject* pTextObj);
+
   int CountRects(int start, int nCount);
   bool GetRect(int rectIndex, CFX_FloatRect* pRect) const;
 
diff --git a/pdfium/fpdfsdk/fpdf_editpage.cpp b/pdfium/fpdfsdk/fpdf_editpage.cpp
index f4a1688..f34d3b5 100644
--- a/pdfium/fpdfsdk/fpdf_editpage.cpp
+++ b/pdfium/fpdfsdk/fpdf_editpage.cpp
@@ -26,6 +26,7 @@
 #include "core/fpdfapi/parser/cpdf_string.h"
 #include "core/fpdfdoc/cpdf_annot.h"
 #include "core/fpdfdoc/cpdf_annotlist.h"
+#include "core/fpdftext/cpdf_textpage.h"
 #include "fpdfsdk/cpdfsdk_helpers.h"
 #include "public/fpdf_formfill.h"
 #include "third_party/base/logging.h"
@@ -656,6 +657,46 @@ FPDFPageObj_SetLineCap(FPDF_PAGEOBJECT page_object, int line_cap) {
   return true;
 }
 
+// Writes |text_object|'s processed text from |page| to |result| as UTF-16LE
+// (at most |char_count| + 1 items with the NUL). |char_start| is no offset.
+FPDF_EXPORT int FPDF_CALLCONV
+FPDFTextObj_GetTextProcessed(FPDF_PAGEOBJECT text_object,
+                             FPDF_TEXTPAGE page,
+                             int char_start,
+                             int char_count,
+                             unsigned short* result)
+{
+  if (!page || !text_object || char_start < 0 || char_count < 0 || !result)
+    return 0;
+
+  CPDF_TextObject* pTxtObj = CPDFTextObjectFromFPDFPageObject(text_object);
+  CPDF_TextPage* textpage = CPDFTextPageFromFPDFTextPage(page);
+  const int char_available = textpage->CountChars() - char_start;
+  if (!pTxtObj || char_available <= 0)  // Unresolved object or no chars left.
+    return 0;
+
+  char_count = std::min(char_count, char_available);
+  if (char_count == 0) {
+    // Writing out "", which has a character count of 1 due to the NUL.
+    *result = '\0';
+    return 1;
+  }
+
+  WideString str = textpage->GetTextObjectText(pTxtObj);
+  if (str.GetLength() > static_cast<size_t>(char_count))
+    str = str.Left(static_cast<size_t>(char_count));
+
+  // UTF16LE_Encode doesn't handle surrogate pairs properly, so it is expected
+  // the number of items to stay the same.
+  ByteString byte_str = str.UTF16LE_Encode();
+  size_t byte_str_len = byte_str.GetLength();
+  constexpr size_t kBytesPerCharacter = sizeof(unsigned short);
+  int ret_count = static_cast<int>(byte_str_len / kBytesPerCharacter);
+  ASSERT(ret_count <= char_count + 1);  // +1 to account for the NUL terminator.
+  memcpy(result, byte_str.GetBuffer(byte_str_len), byte_str_len);
+  return ret_count;
+}
+
 FPDF_EXPORT FPDF_BOOL FPDF_CALLCONV
 FPDFTextObj_GetColor(FPDF_PAGEOBJECT text_object,
                      unsigned int* R,
diff --git a/pdfium/public/fpdf_edit.h b/pdfium/public/fpdf_edit.h
index f249e64..e14b2a5 100644
--- a/pdfium/public/fpdf_edit.h
+++ b/pdfium/public/fpdf_edit.h
@@ -1088,6 +1088,19 @@ FPDFTextObj_CountChars(FPDF_PAGEOBJECT text_object);
 FPDF_EXPORT int FPDF_CALLCONV
 FPDFTextObj_GetFontSize(FPDF_PAGEOBJECT text_object);
 
+// Get the processed text of a text object as UTF-16LE.
+// text_object - Handle of the text object; page - text page to read it from.
+// char_start  - Only validated (>= 0 and below the page's character count).
+// char_count  - Max characters to copy; result needs char_count + 1 items.
+// result      - Receives the text; a NUL terminator may be included.
+// Returns the number of items (not bytes) written in result, 0 on bad input.
+FPDF_EXPORT int FPDF_CALLCONV
+FPDFTextObj_GetTextProcessed(FPDF_PAGEOBJECT text_object,
+                             FPDF_TEXTPAGE page,
+                             int char_start,
+                             int char_count,
+                             unsigned short* result);
+
 // Get the stroke RGBA of a text. Range of values: 0 - 255.
 //
 // path   - the handle to the path object.
-- 
2.16.3