summaryrefslogtreecommitdiffstats
path: root/external/pdfium/0001-Add-FPDFTextObj_GetText-API.patch.1
blob: 22926462cdacc46ba9334c761d9fa6430461ef50 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
From 3bee9c60f013b8b7e99c39ee35699d132b330334 Mon Sep 17 00:00:00 2001
Date: Tue, 7 Aug 2018 21:45:34 +0000
Subject: [PATCH] Add FPDFTextObj_GetText() API

Generalize CPDF_TextPage::GetTextByRect(), so that it's possible to get
the text from a text page using a predicate, that way we can easily
get the text that belongs to single text object as well.

Change-Id: Ia457af0f41184694dc1481709be72b35685bce7f
Reviewed-on: https://pdfium-review.googlesource.com/39530
Reviewed-by: Henrique Nakashima <hnakashima@chromium.org>
Reviewed-by: Lei Zhang <thestig@chromium.org>
Commit-Queue: Lei Zhang <thestig@chromium.org>
---
 core/fpdftext/cpdf_textpage.cpp    | 18 +++++++++++++--
 core/fpdftext/cpdf_textpage.h      |  4 ++++
 fpdfsdk/fpdf_edittext.cpp          | 18 +++++++++++++++
 fpdfsdk/fpdf_text_embeddertest.cpp | 45 ++++++++++++++++++++++++++++++++++++++
 fpdfsdk/fpdf_view_c_api_test.c     |  1 +
 public/fpdf_edit.h                 | 20 +++++++++++++++++
 6 files changed, 104 insertions(+), 2 deletions(-)

diff --git a/core/fpdftext/cpdf_textpage.cpp b/core/fpdftext/cpdf_textpage.cpp
index 289416043..ed7f36fb6 100644
--- a/core/fpdftext/cpdf_textpage.cpp
+++ b/core/fpdftext/cpdf_textpage.cpp
@@ -426,7 +426,8 @@ int CPDF_TextPage::GetIndexAtPos(const CFX_PointF& point,
   return pos < nCount ? pos : NearPos;
 }
 
-WideString CPDF_TextPage::GetTextByRect(const CFX_FloatRect& rect) const {
+WideString CPDF_TextPage::GetTextByPredicate(
+    const std::function<bool(const PAGECHAR_INFO&)>& predicate) const {
   if (!m_bIsParsed)
     return WideString();
 
@@ -435,7 +436,7 @@ WideString CPDF_TextPage::GetTextByRect(const CFX_FloatRect& rect) const {
   bool IsAddLineFeed = false;
   WideString strText;
   for (const auto& charinfo : m_CharList) {
-    if (IsRectIntersect(rect, charinfo.m_CharBox)) {
+    if (predicate(charinfo)) {
       if (fabs(posy - charinfo.m_Origin.y) > 0 && !IsContainPreChar &&
           IsAddLineFeed) {
         posy = charinfo.m_Origin.y;
@@ -460,6 +461,19 @@ WideString CPDF_TextPage::GetTextByRect(const CFX_FloatRect& rect) const {
   return strText;
 }
 
+WideString CPDF_TextPage::GetTextByRect(const CFX_FloatRect& rect) const {
+  return GetTextByPredicate([&rect](const PAGECHAR_INFO& charinfo) {
+    return IsRectIntersect(rect, charinfo.m_CharBox);
+  });
+}
+
+WideString CPDF_TextPage::GetTextByObject(
+    const CPDF_TextObject* pTextObj) const {
+  return GetTextByPredicate([pTextObj](const PAGECHAR_INFO& charinfo) {
+    return charinfo.m_pTextObj == pTextObj;
+  });
+}
+
 void CPDF_TextPage::GetCharInfo(int index, FPDF_CHAR_INFO* info) const {
   if (!m_bIsParsed || !pdfium::IndexInBounds(m_CharList, index))
     return;
diff --git a/core/fpdftext/cpdf_textpage.h b/core/fpdftext/cpdf_textpage.h
index 36d01854f..90b45bd96 100644
--- a/core/fpdftext/cpdf_textpage.h
+++ b/core/fpdftext/cpdf_textpage.h
@@ -8,6 +8,7 @@
 #define CORE_FPDFTEXT_CPDF_TEXTPAGE_H_
 
 #include <deque>
+#include <functional>
 #include <vector>
 
 #include "core/fpdfapi/page/cpdf_pageobjectlist.h"
@@ -97,6 +98,7 @@ class CPDF_TextPage {
   std::vector<CFX_FloatRect> GetRectArray(int start, int nCount) const;
   int GetIndexAtPos(const CFX_PointF& point, const CFX_SizeF& tolerance) const;
   WideString GetTextByRect(const CFX_FloatRect& rect) const;
+  WideString GetTextByObject(const CPDF_TextObject* pTextObj) const;
 
   // Returns string with the text from |m_TextBuf| that are covered by the input
   // range. |start| and |count| are in terms of the |m_CharIndex|, so the range
@@ -151,6 +153,8 @@ class CPDF_TextPage {
   TextOrientation FindTextlineFlowOrientation() const;
   void AppendGeneratedCharacter(wchar_t unicode, const CFX_Matrix& formMatrix);
   void SwapTempTextBuf(int32_t iCharListStartAppend, int32_t iBufStartAppend);
+  WideString GetTextByPredicate(
+      const std::function<bool(const PAGECHAR_INFO&)>& predicate) const;
 
   UnownedPtr<const CPDF_Page> const m_pPage;
   std::vector<uint16_t> m_CharIndex;
diff --git a/fpdfsdk/fpdf_edittext.cpp b/fpdfsdk/fpdf_edittext.cpp
index 6aa44b3b2..2773763b9 100644
--- a/fpdfsdk/fpdf_edittext.cpp
+++ b/fpdfsdk/fpdf_edittext.cpp
@@ -22,6 +22,7 @@
 #include "core/fpdfapi/parser/cpdf_number.h"
 #include "core/fpdfapi/parser/cpdf_reference.h"
 #include "core/fpdfapi/parser/cpdf_stream.h"
+#include "core/fpdftext/cpdf_textpage.h"
 #include "core/fxcrt/fx_extension.h"
 #include "core/fxge/cfx_fontmgr.h"
 #include "core/fxge/fx_font.h"
@@ -564,6 +565,23 @@ FPDFTextObj_GetFontName(FPDF_PAGEOBJECT text,
   return dwStringLen;
 }
 
+FPDF_EXPORT unsigned long FPDF_CALLCONV
+FPDFTextObj_GetText(FPDF_PAGEOBJECT text_object,
+                    FPDF_TEXTPAGE text_page,
+                    void* buffer,
+                    unsigned long length) {
+  CPDF_TextObject* pTextObj = CPDFTextObjectFromFPDFPageObject(text_object);
+  if (!pTextObj)
+    return 0;
+
+  CPDF_TextPage* pTextPage = CPDFTextPageFromFPDFTextPage(text_page);
+  if (!pTextPage)
+    return 0;
+
+  WideString text = pTextPage->GetTextByObject(pTextObj);
+  return Utf16EncodeMaybeCopyAndReturnLength(text, buffer, length);
+}
+
 FPDF_EXPORT void FPDF_CALLCONV FPDFFont_Close(FPDF_FONT font) {
   CPDF_Font* pFont = CPDFFontFromFPDFFont(font);
   if (!pFont)
diff --git a/public/fpdf_edit.h b/public/fpdf_edit.h
index 4d5aa9c48..83fedba90 100644
--- a/public/fpdf_edit.h
+++ b/public/fpdf_edit.h
@@ -1274,6 +1274,26 @@ FPDFTextObj_GetFontName(FPDF_PAGEOBJECT text,
                         void* buffer,
                         unsigned long length);
 
+// Experimental API.
+// Get the text of a text object.
+//
+// text_object      - the handle to the text object.
+// text_page        - the handle to the text page.
+// buffer           - the address of a buffer that receives the text.
+// length           - the size, in bytes, of |buffer|.
+//
+// Returns the number of bytes in the text (including the trailing NUL
+// character) on success, 0 on error.
+//
+// Regardless of the platform, the |buffer| is always in UTF16-LE encoding.
+// If |length| is less than the returned length, or |buffer| is NULL, |buffer|
+// will not be modified.
+FPDF_EXPORT unsigned long FPDF_CALLCONV
+FPDFTextObj_GetText(FPDF_PAGEOBJECT text_object,
+                    FPDF_TEXTPAGE text_page,
+                    void* buffer,
+                    unsigned long length);
+
 // Experimental API.
 // Get number of page objects inside |form_object|.
 //
-- 
2.16.4