Further improve HTML formatting
This commit is contained in:
@@ -106,6 +106,7 @@ std::vector<std::string> LyricsFetcher::getContent(const char *regex_, const std
|
||||
|
||||
void LyricsFetcher::postProcess(std::string &data) const
|
||||
{
|
||||
data = unescapeHtmlUtf8(data);
|
||||
stripHtmlTags(data);
|
||||
// Remove indentation from each line and collapse multiple newlines into one.
|
||||
std::vector<std::string> lines;
|
||||
@@ -157,7 +158,6 @@ LyricsFetcher::Result LyricwikiFetcher::fetch(const std::string &artist, const s
|
||||
data.clear();
|
||||
for (auto it = lyrics.begin(); it != lyrics.end(); ++it)
|
||||
{
|
||||
boost::replace_all(*it, "<br />", "\n");
|
||||
stripHtmlTags(*it);
|
||||
boost::trim(*it);
|
||||
if (!it->empty())
|
||||
@@ -226,34 +226,6 @@ bool GoogleLyricsFetcher::isURLOk(const std::string &url)
|
||||
|
||||
/**********************************************************************/
|
||||
|
||||
void Sing365Fetcher::postProcess(std::string &data) const
|
||||
{
|
||||
// throw away ad
|
||||
data = boost::regex_replace(data, boost::regex("<div.*</div>"), "");
|
||||
LyricsFetcher::postProcess(data);
|
||||
}
|
||||
|
||||
/**********************************************************************/
|
||||
|
||||
void JustSomeLyricsFetcher::postProcess(std::string &data) const
|
||||
{
|
||||
data = unescapeHtmlUtf8(data);
|
||||
LyricsFetcher::postProcess(data);
|
||||
}
|
||||
|
||||
/**********************************************************************/
|
||||
|
||||
void MetrolyricsFetcher::postProcess(std::string &data) const
|
||||
{
|
||||
// some of lyrics have both \n chars and <br />, html tags
|
||||
// are always present whereas \n chars are not, so we need to
|
||||
// throw them away to avoid having line breaks doubled.
|
||||
boost::replace_all(data, " ", "");
|
||||
boost::replace_all(data, "<br />", "\n");
|
||||
data = unescapeHtmlUtf8(data);
|
||||
LyricsFetcher::postProcess(data);
|
||||
}
|
||||
|
||||
bool MetrolyricsFetcher::isURLOk(const std::string &url)
|
||||
{
|
||||
// it sometimes return link to sitemap.xml, which is huge so we need to discard it
|
||||
|
||||
@@ -82,7 +82,6 @@ protected:
|
||||
// Pattern used to locate the lyrics: one lazy capture group holding whatever
// lies between the `<div class="lyrics-body">` opening tag and the first
// `</div>` that follows it.
// NOTE(review): presumably consumed by the base fetcher's content
// extraction - confirm against the caller.
virtual const char *regex() const OVERRIDE { return "<div class=\"lyrics-body\">(.*?)</div>"; }
|
||||
|
||||
virtual bool isURLOk(const std::string &url) OVERRIDE;
|
||||
virtual void postProcess(std::string &data) const OVERRIDE;
|
||||
};
|
||||
|
||||
struct LyricsmaniaFetcher : public GoogleLyricsFetcher
|
||||
@@ -99,8 +98,6 @@ struct Sing365Fetcher : public GoogleLyricsFetcher
|
||||
|
||||
protected:
|
||||
// Pattern used to locate the lyrics: one lazy capture group holding whatever
// lies between the `<!-Lyrics Begin->` and `<!-Lyrics End->` markers.
// NOTE(review): presumably consumed by the base fetcher's content
// extraction - confirm against the caller.
virtual const char *regex() const OVERRIDE { return "<!-Lyrics Begin->(.*?)<!-Lyrics End->"; }
|
||||
|
||||
virtual void postProcess(std::string &data) const OVERRIDE;
|
||||
};
|
||||
|
||||
struct JustSomeLyricsFetcher : public GoogleLyricsFetcher
|
||||
@@ -109,8 +106,6 @@ struct JustSomeLyricsFetcher : public GoogleLyricsFetcher
|
||||
|
||||
protected:
|
||||
// Pattern used to locate the lyrics: matches lazily from `<div class="content`
// to a `</div>`, optional whitespace and a second `</div>`, then captures
// (lazily) everything up to the next `<div`.
// NOTE(review): presumably consumed by the base fetcher's content
// extraction - confirm against the caller.
virtual const char *regex() const OVERRIDE { return "<div class=\"content.*?</div>\\s*</div>(.*?)<div"; }
|
||||
|
||||
virtual void postProcess(std::string &data) const OVERRIDE;
|
||||
};
|
||||
|
||||
struct AzLyricsFetcher : public GoogleLyricsFetcher
|
||||
|
||||
@@ -18,6 +18,7 @@
|
||||
* 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. *
|
||||
***************************************************************************/
|
||||
|
||||
#include <algorithm>
|
||||
#include <boost/algorithm/string/replace.hpp>
|
||||
#include "utility/html.h"
|
||||
|
||||
@@ -58,20 +59,32 @@ void unescapeHtmlEntities(std::string &s)
|
||||
boost::replace_all(s, "&lt;", "<");
|
||||
boost::replace_all(s, "&nbsp;", " ");
|
||||
boost::replace_all(s, "&quot;", "\"");
|
||||
boost::replace_all(s, "&ndash;", "–");
|
||||
boost::replace_all(s, "&mdash;", "—");
|
||||
}
|
||||
|
||||
void stripHtmlTags(std::string &s)
|
||||
{
|
||||
bool is_p, is_slash_p;
|
||||
// Erase newlines so they don't duplicate with HTML ones.
|
||||
s.erase(std::remove_if(s.begin(), s.end(), [](char c) {
|
||||
return c == '\n' || c == '\r';
|
||||
}), s.end());
|
||||
|
||||
bool is_newline;
|
||||
for (size_t i = s.find("<"); i != std::string::npos; i = s.find("<"))
|
||||
{
|
||||
size_t j = s.find(">", i);
|
||||
if (j != std::string::npos)
|
||||
{
|
||||
++j;
|
||||
is_p = s.compare(i, j-i, "<p ") == 0 || s.compare(i, j-i, "<p>") == 0;
|
||||
is_slash_p = s.compare(i, j-i, "</p>") == 0;
|
||||
if (is_p || is_slash_p)
|
||||
is_newline
|
||||
= s.compare(i, std::min<size_t>(3, j-i), "<p ") == 0
|
||||
|| s.compare(i, j-i, "<p>") == 0
|
||||
|| s.compare(i, j-i, "</p>") == 0
|
||||
|| s.compare(i, j-i, "<br>") == 0
|
||||
|| s.compare(i, j-i, "<br/>") == 0
|
||||
|| s.compare(i, std::min<size_t>(4, j-i), "<br ") == 0;
|
||||
if (is_newline)
|
||||
s.replace(i, j-i, "\n");
|
||||
else
|
||||
s.replace(i, j-i, "");
|
||||
|
||||
Reference in New Issue
Block a user