Resolves: tdf#125110 tdf#151211 Disentangle the convoluted CSV/TSV-clip import

The chain of fixes for #i119960# tdf#48621 tdf#125440 produced code that is suboptimal and not robust enough against some further corner cases, taking quoted field content where there shouldn't be. First, in ReadCsvLine() assume that if a generator is broken enough to start a field quoted followed by containing an unescaped embedded quote and there is no closing quote (i.e. immediately before a field delimiter) until the line end then the generator will not be clever enough to write embedded linefeeds either and the field starting quote wasn't one but to be taken literally as all other quotes until the now unquoted field end. In this case do not read a subsequent source line for the current row. Then, for individual fields of a row make a similar assumption, a quote-started field has to end with a quote before a field separator (or line end) or otherwise all quotes of that field are literal data up to the next field separator. This made it necessary to adapt two test cases of the garbage CSV import test to produce different garbage than before. Change-Id: I4424b65c87c7f9dcbe717a7e6cf207352cb613f3 Reviewed-on: https://gerrit.libreoffice.org/c/core/+/140850 Reviewed-by: Eike Rathke <erack@redhat.com> Tested-by: Jenkins
author: Eike Rathke <erack@redhat.com> 2022-10-02 01:16:00 +0200
committer: Eike Rathke <erack@redhat.com> 2022-10-02 17:07:06 +0200
commit: cf777cfcb22647b1f2d6ace307fbcc4f6d2cca30 (patch)
tree: 3859564c8389cc391a6f06c1e9bb04e76a1c7c44
parent: tdf#143158 - Handle double clicks on alphabetical indices (diff)
download: core-cf777cfcb22647b1f2d6ace307fbcc4f6d2cca30.tar.gz
core-cf777cfcb22647b1f2d6ace307fbcc4f6d2cca30.zip
2 files changed, 87 insertions, 23 deletions
diff --git a/sc/qa/unit/data/contentCSV/fdo48621_broken_quotes_exported.csv b/sc/qa/unit/data/contentCSV/fdo48621_broken_quotes_exported.csv
index dfc83c5f3ced..8e10063eefe5 100644
--- a/sc/qa/unit/data/contentCSV/fdo48621_broken_quotes_exported.csv
+++ b/sc/qa/unit/data/contentCSV/fdo48621_broken_quotes_exported.csv
@@ -53,8 +53,8 @@ No it doesn't,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
 ",<- needed to end test file here,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
 ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
 i80385_test2.csv,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
-test,"a""b, ""a"," d""a""c ", m ,j ,d,"b""A""","D""E","f,1","a,b","de""b,a
-""abcdef"" test ""abc","def""g""h","def""gh""",,,,,,,,,,,,,,,,,,,,,,,
+test,"a""b, ""a"," d""a""c ", m ,j ,d,"b""A""","D""E","f,1","a,b","""de""b",a,,,,,,,,,,,,,,,,,,,,,,,,
+"abcdef"" test ""abc","def""g""h","def""gh""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
 "this is
 a test","yes
 it
@@ -78,8 +78,8 @@ No it doesn't,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
 ""a""b""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
 ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
 i80385_test4.csv,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
-test,"a""b, ""a"," d""a""c ", m ,j ,d,"b""A""","D""E","f,1","a,b","de""b,a
-""abcdef"" test ""abc","def""g""h","def""gh""",,,,,,,,,,,,,,,,,,,,,,,
+test,"a""b, ""a"," d""a""c ", m ,j ,d,"b""A""","D""E","f,1","a,b","""de""b",a,,,,,,,,,,,,,,,,,,,,,,,,
+"abcdef"" test ""abc","def""g""h","def""gh""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
 "this is
 a test","yes
 it
diff --git a/sc/source/ui/docshell/impex.cxx b/sc/source/ui/docshell/impex.cxx
index 59d8f9a3538c..2a7ed9da5d06 100644
--- a/sc/source/ui/docshell/impex.cxx
+++ b/sc/source/ui/docshell/impex.cxx
@@ -581,6 +581,11 @@ void ScImportExport::SetNoEndianSwap( SvStream& rStrm )
 #endif
 }
 
+static inline bool lcl_isFieldEnd( sal_Unicode c, const sal_Unicode* pSeps )
+{
+    return !c || ScGlobal::UnicodeStrChr( pSeps, c);
+}
+
 namespace {
 
 enum QuoteType
@@ -615,7 +620,7 @@ static QuoteType lcl_isFieldEndQuote( const sal_Unicode* p, const sal_Unicode* p
     const bool bBlankSep = (p[1] == cBlank && !rcDetectSep && p[2] && p[2] != cBlank);
     while (p[1] == cBlank)
         ++p;
-    if (!p[1] || ScGlobal::UnicodeStrChr( pSeps, p[1]))
+    if (lcl_isFieldEnd( p[1], pSeps))
         return FIELDEND_QUOTE;
     // Extended separator detection after a closing quote (with or without
     // blanks). Note that nQuotes is incremented *after* the call so is not yet
@@ -712,9 +717,30 @@ enum class DoubledQuoteMode
 
 }
 
-static const sal_Unicode* lcl_ScanString( const sal_Unicode* p, OUString& rString,
+/** Scan for a quoted string.
+
+    Precondition: initial current position *p is a cStr quote.
+
+    For DoubledQuoteMode::ESCAPE, if after the closing quote there is a field
+    end (with or without trailing blanks and as determined by
+    lcl_isFieldEndQuote()), then the content is appended to rField with quotes
+    processed and removed. Else if no field end after the quoted string was
+    detected, nothing is appended and processing continues and is repeated
+    until the next quote. If no closing quote at a field end was found at all,
+    nothing is appended and the initial position is returned and caller has to
+    decide, usually just taking all as literal data.
+
+    For DoubledQuoteMode::KEEP_ALL, the string up to and including the closing
+    quote is appended to rField and the next position returned, regardless
+    whether there is a field separator following or not.
+
+ */
+static const sal_Unicode* lcl_ScanString( const sal_Unicode* p, OUString& rField,
             const sal_Unicode* pSeps, sal_Unicode cStr, DoubledQuoteMode eMode, bool& rbOverflowCell )
 {
+    OUString aString;
+    bool bClosingQuote = (eMode == DoubledQuoteMode::KEEP_ALL);
+    const sal_Unicode* const pStart = p;
     if (eMode != DoubledQuoteMode::KEEP_ALL)
         p++;    //! jump over opening quote
     bool bCont;
@@ -724,8 +750,19 @@ static const sal_Unicode* lcl_ScanString( const sal_Unicode* p, OUString& rStrin
         const sal_Unicode* p0 = p;
         for( ;; )
         {
-            if( !*p )
-                break;
+            if (!*p)
+            {
+                // Encountering end of data after an opening quote is not a
+                // quoted string, ReadCsvLine() concatenated lines with '\n'
+                // for a properly quoted embedded linefeed.
+                if (eMode == DoubledQuoteMode::KEEP_ALL)
+                    // Caller would append that data anyway, so we can do it
+                    // already here.
+                    break;
+
+                return pStart;
+            }
+
             if( *p == cStr )
             {
                 if ( *++p != cStr )
@@ -735,7 +772,10 @@ static const sal_Unicode* lcl_ScanString( const sal_Unicode* p, OUString& rStrin
                     {
                         sal_Unicode cDetectSep = 0xffff;    // No separator detection here.
                         if (lcl_isFieldEndQuote( p-1, pSeps, cDetectSep) == FIELDEND_QUOTE)
+                        {
+                            bClosingQuote = true;
                             break;
+                        }
                         else
                             continue;
                     }
@@ -761,10 +801,17 @@ static const sal_Unicode* lcl_ScanString( const sal_Unicode* p, OUString& rStrin
         }
         if ( p0 < p )
         {
-            if (!lcl_appendLineData( rString, p0, ((eMode != DoubledQuoteMode::KEEP_ALL && (*p || *(p-1) == cStr)) ? p-1 : p)))
+            if (!lcl_appendLineData( aString, p0, ((eMode != DoubledQuoteMode::KEEP_ALL && (*p || *(p-1) == cStr)) ? p-1 : p)))
                 rbOverflowCell = true;
         }
     } while ( bCont );
+
+    if (!bClosingQuote)
+        return pStart;
+
+    if (!aString.isEmpty())
+        rField += aString;
+
     return p;
 }
 
@@ -950,18 +997,16 @@ bool ScImportExport::Text2Doc( SvStream& rStrm )
             {
                 aCell.clear();
                 const sal_Unicode* q = p;
-                while (*p && *p != cSep)
+                if (*p == cStr)
                 {
-                    // Always look for a pairing quote and ignore separator in between.
-                    while (*p && *p == cStr)
-                        q = p = lcl_ScanString( p, aCell, pSeps, cStr, mode, bOverflowCell );
-                    // All until next separator or quote.
-                    while (*p && *p != cSep && *p != cStr)
-                        ++p;
-                    if (!lcl_appendLineData( aCell, q, p))
-                        bOverflowCell = true;   // display warning on import
-                    q = p;
+                    // Look for a pairing quote.
+                    q = p = lcl_ScanString( p, aCell, pSeps, cStr, mode, bOverflowCell );
                 }
+                // All until next separator.
+                while (*p && *p != cSep)
+                    ++p;
+                if (!lcl_appendLineData( aCell, q, p))
+                    bOverflowCell = true;   // display warning on import
                 if (*p)
                     ++p;
                 if (rDoc.ValidCol(nCol) && rDoc.ValidRow(nRow) )
@@ -1825,7 +1870,7 @@ const sal_Unicode* ScImportExport::ScanNextFieldFromString( const sal_Unicode* p
         rbIsQuoted = true;
         const sal_Unicode* p1;
         p1 = p = lcl_ScanString( p, rField, pSeps, cStr, DoubledQuoteMode::ESCAPE, rbOverflowCell );
-        while ( *p && !ScGlobal::UnicodeStrChr( pSeps, *p ) )
+        while (!lcl_isFieldEnd( *p, pSeps))
             p++;
         // Append remaining unquoted and undelimited data (dirty, dirty) to
         // this field.
@@ -1846,7 +1891,7 @@ const sal_Unicode* ScImportExport::ScanNextFieldFromString( const sal_Unicode* p
     else                        // up to delimiter
     {
         const sal_Unicode* p0 = p;
-        while ( *p && !ScGlobal::UnicodeStrChr( pSeps, *p ) )
+        while (!lcl_isFieldEnd( *p, pSeps))
             p++;
         const sal_Unicode* ptrim_i = p0;
         const sal_Unicode* ptrim_f = p;  // [ptrim_i,ptrim_f) is cell data after trimming
@@ -1864,7 +1909,7 @@ const sal_Unicode* ScImportExport::ScanNextFieldFromString( const sal_Unicode* p
     }
     if ( bMergeSeps )           // skip following delimiters
     {
-        while ( *p && ScGlobal::UnicodeStrChr( pSeps, *p ) )
+        while (!lcl_isFieldEnd( *p, pSeps))
             p++;
     }
     return p;
@@ -2694,6 +2739,8 @@ Label_RetryWithNewSep:
 
     if (bEmbeddedLineBreak)
     {
+        sal_Int32 nFirstLineLength = aStr.getLength();
+        sal_uInt64 nFirstLineStreamPos = rStream.Tell();
         sal_uInt32 nLine = 0;
 
         const sal_Unicode* pSeps = rFieldSeparators.getStr();
@@ -2726,6 +2773,8 @@ Label_RetryWithNewSep:
                             ++nQuotes;
                             bFieldStart = false;
                             eQuoteState = FIELDSTART_QUOTE;
+                            nFirstLineLength = aStr.getLength();
+                            nFirstLineStreamPos = rStream.Tell();
                         }
                         // Do not detect a FIELDSTART_QUOTE if not in
                         // bFieldStart mode, in which case for unquoted content
@@ -2766,6 +2815,8 @@ Label_RetryWithNewSep:
                         nQuotes = 1;
                         eQuoteState = FIELDSTART_QUOTE;
                         bFieldStart = false;
+                        nFirstLineLength = aStr.getLength();
+                        nFirstLineStreamPos = rStream.Tell();
                     }
                     else if (eQuoteState == FIELDEND_QUOTE)
                     {
@@ -2789,6 +2840,11 @@ Label_RetryWithNewSep:
                 // nArbitraryLineLengthLimit (or nMaxSourceLines below) we
                 // split a string right between a doubled quote pair.
                 break;
+            else if (eQuoteState == DONTKNOW_QUOTE)
+                // A single unescaped quote somewhere in a quote started
+                // field, most likely that was not meant to have embedded
+                // linefeeds either.
+                break;
             else if (++nLine >= nMaxSourceLines && nMaxSourceLines > 0)
                 // Unconditionally increment nLine even if nMaxSourceLines==0
                 // so it can be observed in debugger.
@@ -2798,9 +2854,17 @@ Label_RetryWithNewSep:
                 nLastOffset = aStr.getLength();
                 OUString aNext;
                 rStream.ReadUniOrByteStringLine(aNext, rStream.GetStreamCharSet(), nArbitraryLineLengthLimit);
-                aStr += "\n" + aNext;
+                if (!rStream.eof())
+                    aStr += "\n" + aNext;
             }
         }
+        if (nQuotes & 1)
+        {
+            // No closing quote at all. A single quote at field start => no
+            // embedded linefeeds for that field, take only first logical line.
+            aStr = aStr.copy( 0, nFirstLineLength);
+            rStream.Seek( nFirstLineStreamPos);
+        }
     }
     return aStr;
 }
author	Eike Rathke <erack@redhat.com>	2022-10-02 01:16:00 +0200
committer	Eike Rathke <erack@redhat.com>	2022-10-02 17:07:06 +0200
commit	cf777cfcb22647b1f2d6ace307fbcc4f6d2cca30 (patch)
tree	3859564c8389cc391a6f06c1e9bb04e76a1c7c44
parent	tdf#143158 - Handle double clicks on alphabetical indices (diff)
download	core-cf777cfcb22647b1f2d6ace307fbcc4f6d2cca30.tar.gz core-cf777cfcb22647b1f2d6ace307fbcc4f6d2cca30.zip