summaryrefslogtreecommitdiffstats
path: root/svl/source/misc/sharedstringpool.cxx
blob: d88b0a2c22efe42a941452edcd00f196368589cb (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*
 * This file is part of the LibreOffice project.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
 */

#include <svl/sharedstringpool.hxx>
#include <svl/sharedstring.hxx>
#include <unotools/charclass.hxx>

#include <mutex>
#include <unordered_map>
#include <unordered_set>

#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wshadow"
#endif
#if defined __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wunused-parameter"
#endif
#if defined _MSC_VER
#pragma warning(push)
#pragma warning(disable : 4324) // structure was padded due to alignment specifier
#endif
#include <libcuckoo/cuckoohash_map.hh>
#if defined _MSC_VER
#pragma warning(pop)
#endif
#if defined __clang__
#pragma clang diagnostic pop
#endif
#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif

namespace svl
{
namespace
{
sal_Int32 getRefCount(const rtl_uString* p) { return (p->refCount & 0x3FFFFFFF); }

// we store the key twice, because the concurrent hashtable we are using does not provide any way to return the key in use
typedef std::pair<OUString, OUString> Mapped;

struct HashFunction
{
    size_t operator()(rtl_uString* const key) const
    {
        return rtl_ustr_hashCode_WithLength(key->buffer, key->length);
    }
};

struct EqualsFunction
{
    bool operator()(rtl_uString* const lhs, rtl_uString* const rhs) const
    {
        return OUString::unacquired(&lhs) == OUString::unacquired(&rhs);
    }
};
}

struct SharedStringPool::Impl
{
    // We use this map for two purposes - to store lower->upper case mappings
    // and to store an upper->upper mapping.
    // The second mapping is used so that we can
    // share the same rtl_uString object between different keys which map to the same uppercase string to save memory.
    //
    // Docs for this concurrent hashtable here: http://efficient.github.io/libcuckoo/classlibcuckoo_1_1cuckoohash__map.html
    libcuckoo::cuckoohash_map<rtl_uString*, Mapped, HashFunction, EqualsFunction> maStrMap;
    const CharClass& mrCharClass;

    explicit Impl(const CharClass& rCharClass)
        : mrCharClass(rCharClass)
    {
    }
};

SharedStringPool::SharedStringPool(const CharClass& rCharClass)
    : mpImpl(new Impl(rCharClass))
{
}

SharedStringPool::~SharedStringPool() {}

SharedString SharedStringPool::intern(const OUString& rStr)
{
    auto& rMap = mpImpl->maStrMap;

    rtl_uString *pResultLower = {}, *pResultUpper = {}; // bogus GCC 12 -Werror=maybe-uninitialized
    if (rMap.find_fn(rStr.pData, [&](const Mapped& rMapped) {
            pResultLower = rMapped.first.pData;
            pResultUpper = rMapped.second.pData;
        }))
        // there is already a mapping
        return SharedString(pResultLower, pResultUpper);

    // This is a new string insertion. Establish mapping to upper-case variant.
    OUString aUpper = mpImpl->mrCharClass.uppercase(rStr);

    // either insert a new upper->upper mapping, or write the existing mapping into aUpper
    mpImpl->maStrMap.uprase_fn(aUpper.pData,
                               [&](Mapped& mapped) -> bool {
                                   aUpper = mapped.second;
                                   return false;
                               },
                               aUpper, aUpper);

    if (aUpper == rStr)
        // no need to do anything more, because the key is already uppercase
        return SharedString(aUpper.pData, aUpper.pData);

    // either insert a new lower->upper mapping, or write the existing mapping into aLower
    if (mpImpl->maStrMap.uprase_fn(rStr.pData,
                                   [&](Mapped& mapped) -> bool {
                                       pResultLower = mapped.first.pData;
                                       pResultUpper = mapped.second.pData;
                                       return false;
                                   },
                                   rStr, aUpper))
    {
        pResultLower = rStr.pData;
        pResultUpper = aUpper.pData;
    }

    return SharedString(pResultLower, pResultUpper);
}

void SharedStringPool::purge()
{
    auto locked_table = mpImpl->maStrMap.lock_table();

    // Because we can have an uppercase entry mapped to itself,
    // and then a bunch of lowercase entries mapped to that same
    // upper-case entry, we need to scan the map twice - the first
    // time to remove lowercase entries, and then only can we
    // check for unused uppercase entries.

    auto it = locked_table.begin();
    auto itEnd = locked_table.end();
    while (it != itEnd)
    {
        rtl_uString* p1 = it->second.first.pData;
        rtl_uString* p2 = it->second.second.pData;
        if (p1 != p2)
        {
            // normal case - lowercase mapped to uppercase, which
            // means that the lowercase entry has one ref-counted
            // entry as the key in the map
            if (getRefCount(p1) == 1)
            {
                it = locked_table.erase(it);
                continue;
            }
        }
        ++it;
    }

    it = locked_table.begin();
    itEnd = locked_table.end();
    while (it != itEnd)
    {
        rtl_uString* p1 = it->second.first.pData;
        rtl_uString* p2 = it->second.second.pData;
        if (p1 == p2)
        {
            // uppercase which is mapped to itself, which means
            // one ref-counted entry as the key in the map, and
            // one ref-counted entry in the value in the map
            if (getRefCount(p1) == 2)
            {
                it = locked_table.erase(it);
                continue;
            }
        }
        ++it;
    }
}

size_t SharedStringPool::getCount() const { return mpImpl->maStrMap.size(); }

size_t SharedStringPool::getCountIgnoreCase() const
{
    // this is only called from unit tests, so no need to be efficient
    std::unordered_set<OUString> aUpperSet;
    auto locked_table = mpImpl->maStrMap.lock_table();
    for (auto const& pair : locked_table)
        aUpperSet.insert(pair.second.second);
    return aUpperSet.size();
}
}

/* vim:set shiftwidth=4 softtabstop=4 expandtab: */