summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorEike Rathke <erack@redhat.com>2021-08-18 19:05:08 +0200
committerCaolán McNamara <caolanm@redhat.com>2021-08-20 09:49:08 +0200
commit8bcfa0829bc5741141d1c491a527bc53b51b4f33 (patch)
tree7870f0950b4404b6b82d1d470f273dbcfdc26e09
parenttdf#143624 PPTX: slideshow setting “Manual forwarding” not exported (diff)
downloadcore-8bcfa0829bc5741141d1c491a527bc53b51b4f33.tar.gz
core-8bcfa0829bc5741141d1c491a527bc53b51b4f33.zip
Resolves: tdf#102846 CSV: Detect separator, limit preview line concatenations
In CSV import preview, if a line starts with a quote character and none of the remembered last field separators used occur in data in conjunction with a closing quote, then reading data tried to concatenate line by line to form a data field to be presented in the preview, worst case the entire file.. For the preview, detect one possible not yet selected separator if used with a quoted field (similar to commit c807e7ea7a0725a4d8375eda07d6f70870e0d50a for tdf#56910 space separator) and limit the number of source lines that are tried to be concatenated if no separator was encountered after a possibly closing field quote. Change-Id: Iefd37a8301161e72cb607cea88d4faadad47b4ae Reviewed-on: https://gerrit.libreoffice.org/c/core/+/120690 Reviewed-by: Eike Rathke <erack@redhat.com> Tested-by: Jenkins (cherry picked from commit ff62e0165a0add7c7e3cb606df5b24b20c822d8a) Reviewed-on: https://gerrit.libreoffice.org/c/core/+/120717 Reviewed-by: Caolán McNamara <caolanm@redhat.com>
-rw-r--r--sc/source/ui/dbgui/scuiasciiopt.cxx40
-rw-r--r--sc/source/ui/docshell/impex.cxx50
-rw-r--r--sc/source/ui/inc/impex.hxx18
-rw-r--r--sc/source/ui/inc/scuiasciiopt.hxx2
4 files changed, 85 insertions, 25 deletions
diff --git a/sc/source/ui/dbgui/scuiasciiopt.cxx b/sc/source/ui/dbgui/scuiasciiopt.cxx
index b66403492961..a95d8bdfff7a 100644
--- a/sc/source/ui/dbgui/scuiasciiopt.cxx
+++ b/sc/source/ui/dbgui/scuiasciiopt.cxx
@@ -44,6 +44,14 @@
//! TODO make dynamic
const SCSIZE ASCIIDLG_MAXROWS = MAXROWCOUNT;
+// Maximum number of source lines to concatenate while generating the preview
+// for one logical line. This may result in a wrong preview if the actual
+// number of embedded line feeds is greater, but a number too high would take
+// too much time (loop excessively if unlimited and large data) if none of the
+// selected separators are actually used in data but a field at start of line
+// is quoted.
+constexpr sal_uInt32 kMaxEmbeddedLinefeeds = 500;
+
using namespace com::sun::star::uno;
namespace {
@@ -293,7 +301,7 @@ ScImportAsciiDlg::ScImportAsciiDlg(weld::Window* pParent, const OUString& aDatNa
, mnRowPosCount(0)
, mcTextSep(ScAsciiOptions::cDefaultTextSep)
, meCall(eCall)
- , mbDetectSpaceSep(eCall != SC_TEXTTOCOLUMNS)
+ , mbDetectSep(eCall != SC_TEXTTOCOLUMNS)
, mxFtCharSet(m_xBuilder->weld_label("textcharset"))
, mxLbCharSet(new SvxTextEncodingBox(m_xBuilder->weld_combo_box("charset")))
, mxFtCustomLang(m_xBuilder->weld_label("textlanguage"))
@@ -579,7 +587,7 @@ bool ScImportAsciiDlg::GetLine( sal_uLong nLine, OUString &rText, sal_Unicode& r
break;
}
rText = ReadCsvLine(*mpDatStream, !bFixed, maFieldSeparators,
- mcTextSep, rcDetectSep);
+ mcTextSep, rcDetectSep, kMaxEmbeddedLinefeeds);
mnStreamPos = mpDatStream->Tell();
mpRowPosArray[++mnRowPosCount] = mnStreamPos;
} while (nLine >= mnRowPosCount && mpDatStream->good());
@@ -594,7 +602,7 @@ bool ScImportAsciiDlg::GetLine( sal_uLong nLine, OUString &rText, sal_Unicode& r
else
{
Seek( mpRowPosArray[nLine]);
- rText = ReadCsvLine(*mpDatStream, !bFixed, maFieldSeparators, mcTextSep, rcDetectSep);
+ rText = ReadCsvLine(*mpDatStream, !bFixed, maFieldSeparators, mcTextSep, rcDetectSep, kMaxEmbeddedLinefeeds);
mnStreamPos = mpDatStream->Tell();
}
@@ -793,7 +801,11 @@ IMPL_LINK_NOARG(ScImportAsciiDlg, UpdateTextHdl, ScCsvTableBox&, void)
// when the dialog wasn't already presented to the user.
// As a side effect this has the benefit that the check is only done on the
// first set of visible lines.
- sal_Unicode cDetectSep = (mbDetectSpaceSep && !mxRbFixed->get_active() && !mxCkbSpace->get_active() ? 0 : 0xffff);
+ sal_Unicode cDetectSep = ((mbDetectSep && !mxRbFixed->get_active()
+ && (!mxCkbTab->get_active() || !mxCkbSemicolon->get_active()
+ || !mxCkbComma->get_active() || !mxCkbSpace->get_active())) ? 0 : 0xffff);
+ if (cDetectSep == 0xffff)
+ mbDetectSep = false;
sal_Int32 nBaseLine = mxTableBox->GetGrid().GetFirstVisLine();
sal_Int32 nRead = mxTableBox->GetGrid().GetVisLineCount();
@@ -813,16 +825,22 @@ IMPL_LINK_NOARG(ScImportAsciiDlg, UpdateTextHdl, ScCsvTableBox&, void)
for (; i < CSV_PREVIEW_LINES; i++)
maPreviewLine[i].clear();
- if (mbDetectSpaceSep)
+ if (mbDetectSep)
{
- mbDetectSpaceSep = false;
- if (cDetectSep == ' ')
+ mbDetectSep = false;
+ if (cDetectSep)
{
- // Expect space to be appended by now so all subsequent
+ // Expect separator to be appended by now so all subsequent
// GetLine()/ReadCsvLine() actually used it.
- assert(maFieldSeparators.endsWith(" "));
- // Preselect Space in UI.
- mxCkbSpace->set_active(true);
+ assert(maFieldSeparators.endsWith(OUStringChar(cDetectSep)));
+ // Preselect separator in UI.
+ switch (cDetectSep)
+ {
+ case '\t': mxCkbTab->set_active(true); break;
+ case ';': mxCkbSemicolon->set_active(true); break;
+ case ',': mxCkbComma->set_active(true); break;
+ case ' ': mxCkbSpace->set_active(true); break;
+ }
}
}
diff --git a/sc/source/ui/docshell/impex.cxx b/sc/source/ui/docshell/impex.cxx
index 6297fba3a1a7..f271a70feee9 100644
--- a/sc/source/ui/docshell/impex.cxx
+++ b/sc/source/ui/docshell/impex.cxx
@@ -606,17 +606,40 @@ static QuoteType lcl_isFieldEndQuote( const sal_Unicode* p, const sal_Unicode* p
// Due to broken CSV generators that don't double embedded quotes check if
// a field separator immediately or with trailing spaces follows the quote,
// only then end the field, or at end of string.
- const sal_Unicode cBlank = ' ';
+ constexpr sal_Unicode cBlank = ' ';
if (p[1] == cBlank && ScGlobal::UnicodeStrChr( pSeps, cBlank))
return FIELDEND_QUOTE;
// Detect a possible blank separator if it's not already in the list (which
// was checked right above for p[1]==cBlank).
- if (p[1] == cBlank && !rcDetectSep && p[2] && p[2] != cBlank)
- rcDetectSep = cBlank;
+ const bool bBlankSep = (p[1] == cBlank && !rcDetectSep && p[2] && p[2] != cBlank);
while (p[1] == cBlank)
++p;
if (!p[1] || ScGlobal::UnicodeStrChr( pSeps, p[1]))
return FIELDEND_QUOTE;
+ // Extended separator detection after a closing quote (with or without
+ // blanks). Note that nQuotes is incremented *after* the call so is not yet
+ // even here, and that with separator detection we reach here only if
+ // lcl_isEscapedOrFieldEndQuote() did not already detect FIRST_QUOTE or
+ // SECOND_QUOTE for an escaped embedded quote, thus nQuotes does not have
+ // to be checked.
+ if (!rcDetectSep)
+ {
+ constexpr sal_Unicode vSep[] = { ',', '\t', ';' };
+ for (const sal_Unicode c : vSep)
+ {
+ if (p[1] == c)
+ {
+ rcDetectSep = c;
+ return FIELDEND_QUOTE;
+ }
+ }
+ }
+ // Blank separator is least significant, after others.
+ if (bBlankSep)
+ {
+ rcDetectSep = cBlank;
+ return FIELDEND_QUOTE;
+ }
return DONTKNOW_QUOTE;
}
@@ -644,7 +667,7 @@ static QuoteType lcl_isFieldEndQuote( const sal_Unicode* p, const sal_Unicode* p
static QuoteType lcl_isEscapedOrFieldEndQuote( sal_Int32 nQuotes, const sal_Unicode* p,
const sal_Unicode* pSeps, sal_Unicode cStr, sal_Unicode& rcDetectSep )
{
- if ((nQuotes % 2) == 0)
+ if ((nQuotes & 1) == 0)
{
if (p[-1] == cStr)
return SECOND_QUOTE;
@@ -2480,7 +2503,7 @@ ScImportStringStream::ScImportStringStream( const OUString& rStr )
}
OUString ReadCsvLine( SvStream &rStream, bool bEmbeddedLineBreak,
- OUString& rFieldSeparators, sal_Unicode cFieldQuote, sal_Unicode& rcDetectSep )
+ OUString& rFieldSeparators, sal_Unicode cFieldQuote, sal_Unicode& rcDetectSep, sal_uInt32 nMaxSourceLines )
{
enum RetryState
{
@@ -2505,6 +2528,8 @@ Label_RetryWithNewSep:
if (bEmbeddedLineBreak)
{
+ sal_uInt32 nLine = 0;
+
const sal_Unicode* pSeps = rFieldSeparators.getStr();
QuoteType eQuoteState = FIELDEND_QUOTE;
@@ -2543,10 +2568,11 @@ Label_RetryWithNewSep:
{
eQuoteState = lcl_isEscapedOrFieldEndQuote( nQuotes, p, pSeps, cFieldQuote, rcDetectSep);
- if (eRetryState == RetryState::ALLOW && rcDetectSep == ' ')
+ if (eRetryState == RetryState::ALLOW && rcDetectSep)
{
eRetryState = RetryState::RETRY;
- rFieldSeparators += " ";
+ rFieldSeparators += OUStringChar(rcDetectSep);
+ pSeps = rFieldSeparators.getStr();
goto Label_RetryWithNewSep;
}
@@ -2592,10 +2618,14 @@ Label_RetryWithNewSep:
++p;
}
- if (nQuotes % 2 == 0)
+ if ((nQuotes & 1) == 0)
// We still have a (theoretical?) problem here if due to
- // nArbitraryLineLengthLimit we split a string right between a
- // doubled quote pair.
+ // nArbitraryLineLengthLimit (or nMaxSourceLines below) we
+ // split a string right between a doubled quote pair.
+ break;
+ else if (++nLine >= nMaxSourceLines && nMaxSourceLines > 0)
+ // Unconditionally increment nLine even if nMaxSourceLines==0
+ // so it can be observed in debugger.
break;
else
{
diff --git a/sc/source/ui/inc/impex.hxx b/sc/source/ui/inc/impex.hxx
index b334755ca011..031a0ca3c571 100644
--- a/sc/source/ui/inc/impex.hxx
+++ b/sc/source/ui/inc/impex.hxx
@@ -194,14 +194,25 @@ public:
The quote character used.
@param rcDetectSep
- If 0 then attempt to detect a possible space (blank) separator if
+ If 0 then attempt to detect a possible separator if
rFieldSeparators doesn't include it already. This can be necessary because
of the "accept broken misquoted CSV fields" feature that tries to ignore
trailing blanks after a quoted field and if no separator follows continues
to add content to the field assuming the single double quote was in error.
- If this blank separator is detected it is added to rFieldSeparators and the
+ It is also necessary if the only possible separator was not selected and
+ not included in rFieldSeparators and a line starts with a quoted field, in
+ which case appending lines is tried until end of file.
+ If a separator is detected it is added to rFieldSeparators and the
line is reread with the new separators
+ @param nMaxSourceLines
+ Maximum source lines to read and combine into one logical line for embedded
+ new line purpose. Should be limited for the preview dialog because only
+ non-matching separators selected otherwise would lead to trying to
+ concatenate lines until file end.
+ If 0 no limit other than the internal arbitrary resulting line length
+ limit.
+
check Stream::good() to detect IO problems during read
@ATTENTION
@@ -222,6 +233,7 @@ public:
*/
SC_DLLPUBLIC OUString ReadCsvLine( SvStream &rStream, bool bEmbeddedLineBreak,
- OUString& rFieldSeparators, sal_Unicode cFieldQuote, sal_Unicode& rcDetectSep );
+ OUString& rFieldSeparators, sal_Unicode cFieldQuote, sal_Unicode& rcDetectSep,
+ sal_uInt32 nMaxSourceLines = 0 );
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/sc/source/ui/inc/scuiasciiopt.hxx b/sc/source/ui/inc/scuiasciiopt.hxx
index 140ca09122d0..eae2f2f06bc0 100644
--- a/sc/source/ui/inc/scuiasciiopt.hxx
+++ b/sc/source/ui/inc/scuiasciiopt.hxx
@@ -44,7 +44,7 @@ class ScImportAsciiDlg : public weld::GenericDialogController
rtl_TextEncoding meCharSet; /// Selected char set.
bool mbCharSetSystem; /// Is System char set selected?
ScImportAsciiCall meCall; /// How the dialog is called (see asciiopt.hxx)
- bool mbDetectSpaceSep; /// Whether to detect a possible space separator.
+ bool mbDetectSep; /// Whether to detect a possible separator.
std::unique_ptr<weld::Label> mxFtCharSet;
std::unique_ptr<SvxTextEncodingBox> mxLbCharSet;