Add support for ignoring diacritics while searching and filtering lists

This commit is contained in:
Andrzej Rybczak
2017-03-29 21:38:14 +02:00
parent 29403d41ab
commit 66912d73da
14 changed files with 109 additions and 48 deletions

View File

@@ -25,6 +25,8 @@
#ifdef BOOST_REGEX_ICU
# include <boost/regex/icu.hpp>
# include <unicode/errorcode.h>
# include <unicode/translit.h>
#else
# include <boost/regex.hpp>
#endif // BOOST_REGEX_ICU
@@ -32,6 +34,39 @@
#include <cassert>
#include <iostream>
#include "utility/functional.h"
namespace {
#ifdef BOOST_REGEX_ICU
struct StripDiacritics
{
static void convert(UnicodeString &s)
{
if (m_converter == nullptr)
{
ErrorCode result;
m_converter = Transliterator::createInstance(
"NFD; [:M:] Remove; NFC", UTRANS_FORWARD, result);
if (result.isFailure())
throw std::runtime_error(
"instantiation of transliterator instance failed with "
+ std::string(result.errorName()));
}
m_converter->transliterate(s);
}
private:
static Transliterator *m_converter;
};
Transliterator *StripDiacritics::m_converter;
#endif // BOOST_REGEX_ICU
}
namespace Regex {
typedef
@@ -43,31 +78,44 @@ typedef
Regex;
// Builds a Regex from the pattern `s` using the given syntax `flags`.
//
// When BOOST_REGEX_ICU is defined, `s` and `flags` are forwarded to
// boost::make_u32regex; otherwise they are passed to the boost::regex
// constructor. `s` is perfectly forwarded, so both lvalue and rvalue
// pattern strings are accepted.
//
// NOTE(review): this span appears to contain both the previous and the
// updated spelling of the signature and of the preprocessor directives
// (diff markers seem to be missing) -- confirm against the real header.
template <typename StringT>
inline Regex make(StringT &&s, boost::regex_constants::syntax_option_type flags)
inline Regex make(StringT &&s,
boost::regex_constants::syntax_option_type flags)
{
return
# ifdef BOOST_REGEX_ICU
#ifdef BOOST_REGEX_ICU
boost::make_u32regex
# else
#else
boost::regex
# endif // BOOST_REGEX_ICU
#endif // BOOST_REGEX_ICU
(std::forward<StringT>(s), flags);
}
// Returns true if the regular expression `rx` matches somewhere in `s`.
//
// With BOOST_REGEX_ICU defined and `ignore_diacritics` set, `s` is first
// converted to a char string via convertString<char, CharT>::apply, wrapped
// in a UnicodeString (interpreted as UTF-8), stripped of diacritics by
// StripDiacritics::convert, and only then searched. Otherwise `s` is
// searched as-is; in the non-ICU build `ignore_diacritics` is not consulted.
//
// std::out_of_range raised during the search is reported on std::cerr and
// turned into a `false` result. Other exceptions (for example the
// std::runtime_error thrown by StripDiacritics::convert) are not caught here.
//
// NOTE(review): this span appears to contain both the previous
// (StringT-based) and the updated (CharT-based) version of the function
// (diff markers seem to be missing) -- confirm against the real header.
template <typename StringT>
inline bool search(StringT &&s, const Regex &rx)
template <typename CharT>
inline bool search(const std::basic_string<CharT> &s,
const Regex &rx,
bool ignore_diacritics)
{
try {
return
# ifdef BOOST_REGEX_ICU
boost::u32regex_search
# else
boost::regex_search
# endif // BOOST_REGEX_ICU
(std::forward<StringT>(s), rx);
#ifdef BOOST_REGEX_ICU
if (ignore_diacritics)
{
// Work on a diacritic-free copy so that `s` itself is left untouched.
auto us = UnicodeString::fromUTF8(
StringPiece(convertString<char, CharT>::apply(s)));
StripDiacritics::convert(us);
return boost::u32regex_search(us, rx);
}
else
return boost::u32regex_search(s, rx);
#else
return boost::regex_search(s, rx);
#endif // BOOST_REGEX_ICU
} catch (std::out_of_range &e) {
// Invalid UTF-8 sequence, ignore the string.
std::cerr << "Regex::search: error while processing \"" << s << "\": " << e.what() << "\n";
std::cerr << "Regex::search: error while processing \""
<< s
<< "\": "
<< e.what()
<< "\n";
return false;
}
}