Further improve HTML formatting

This commit is contained in:
Andrzej Rybczak
2016-11-13 07:22:06 +01:00
parent dbf5a1dbd8
commit e650b145df
3 changed files with 18 additions and 38 deletions

View File

@@ -106,6 +106,7 @@ std::vector<std::string> LyricsFetcher::getContent(const char *regex_, const std
void LyricsFetcher::postProcess(std::string &data) const
{
data = unescapeHtmlUtf8(data);
stripHtmlTags(data);
// Remove indentation from each line and collapse multiple newlines into one.
std::vector<std::string> lines;
@@ -157,7 +158,6 @@ LyricsFetcher::Result LyricwikiFetcher::fetch(const std::string &artist, const s
data.clear();
for (auto it = lyrics.begin(); it != lyrics.end(); ++it)
{
boost::replace_all(*it, "<br />", "\n");
stripHtmlTags(*it);
boost::trim(*it);
if (!it->empty())
@@ -226,34 +226,6 @@ bool GoogleLyricsFetcher::isURLOk(const std::string &url)
/**********************************************************************/
void Sing365Fetcher::postProcess(std::string &data) const
{
// throw away ad
data = boost::regex_replace(data, boost::regex("<div.*</div>"), "");
LyricsFetcher::postProcess(data);
}
/**********************************************************************/
void JustSomeLyricsFetcher::postProcess(std::string &data) const
{
	// Run the text through unescapeHtmlUtf8 and store the result back
	// into the caller's buffer.
	std::string decoded = unescapeHtmlUtf8(data);
	data.swap(decoded);

	// Then apply the post-processing shared by all fetchers.
	LyricsFetcher::postProcess(data);
}
/**********************************************************************/
// Normalizes line breaks in the fetched lyrics, unescapes the text and then
// runs the post-processing shared by all fetchers. `data` is modified in place.
void MetrolyricsFetcher::postProcess(std::string &data) const
{
// Some lyrics contain both encoded newline characters ("&#10;") and
// "<br />" tags. The HTML tags are always present whereas the newline
// characters are not, so the encoded newlines are dropped and only the
// tags are turned into line breaks; otherwise line breaks would be doubled.
boost::replace_all(data, "&#10;", "");
boost::replace_all(data, "<br />", "\n");
// Unescape only after the replacements above, so that "&#10;" is removed
// while it is still in its encoded form.
data = unescapeHtmlUtf8(data);
LyricsFetcher::postProcess(data);
}
bool MetrolyricsFetcher::isURLOk(const std::string &url)
{
// it sometimes returns a link to sitemap.xml, which is huge, so we need to discard it

View File

@@ -82,7 +82,6 @@ protected:
virtual const char *regex() const OVERRIDE { return "<div class=\"lyrics-body\">(.*?)</div>"; }
virtual bool isURLOk(const std::string &url) OVERRIDE;
virtual void postProcess(std::string &data) const OVERRIDE;
};
struct LyricsmaniaFetcher : public GoogleLyricsFetcher
@@ -99,8 +98,6 @@ struct Sing365Fetcher : public GoogleLyricsFetcher
protected:
virtual const char *regex() const OVERRIDE { return "<!-Lyrics Begin->(.*?)<!-Lyrics End->"; }
virtual void postProcess(std::string &data) const OVERRIDE;
};
struct JustSomeLyricsFetcher : public GoogleLyricsFetcher
@@ -109,8 +106,6 @@ struct JustSomeLyricsFetcher : public GoogleLyricsFetcher
protected:
virtual const char *regex() const OVERRIDE { return "<div class=\"content.*?</div>\\s*</div>(.*?)<div"; }
virtual void postProcess(std::string &data) const OVERRIDE;
};
struct AzLyricsFetcher : public GoogleLyricsFetcher

View File

@@ -18,6 +18,7 @@
* 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. *
***************************************************************************/
#include <algorithm>
#include <boost/algorithm/string/replace.hpp>
#include "utility/html.h"
@@ -58,20 +59,32 @@ void unescapeHtmlEntities(std::string &s)
boost::replace_all(s, "&lt;", "<");
boost::replace_all(s, "&nbsp;", " ");
boost::replace_all(s, "&quot;", "\"");
boost::replace_all(s, "&ndash;", "");
boost::replace_all(s, "&mdash;", "");
}
void stripHtmlTags(std::string &s)
{
bool is_p, is_slash_p;
// Erase newlines so they don't duplicate with HTML ones.
s.erase(std::remove_if(s.begin(), s.end(), [](char c) {
return c == '\n' || c == '\r';
}), s.end());
bool is_newline;
for (size_t i = s.find("<"); i != std::string::npos; i = s.find("<"))
{
size_t j = s.find(">", i);
if (j != std::string::npos)
{
++j;
is_p = s.compare(i, j-i, "<p ") == 0 || s.compare(i, j-i, "<p>") == 0;
is_slash_p = s.compare(i, j-i, "</p>") == 0;
if (is_p || is_slash_p)
is_newline
= s.compare(i, std::min<size_t>(3, j-i), "<p ") == 0
|| s.compare(i, j-i, "<p>") == 0
|| s.compare(i, j-i, "</p>") == 0
|| s.compare(i, j-i, "<br>") == 0
|| s.compare(i, j-i, "<br/>") == 0
|| s.compare(i, std::min<size_t>(4, j-i), "<br ") == 0;
if (is_newline)
s.replace(i, j-i, "\n");
else
s.replace(i, j-i, "");