Further improve HTML formatting
This commit is contained in:
@@ -106,6 +106,7 @@ std::vector<std::string> LyricsFetcher::getContent(const char *regex_, const std
|
|||||||
|
|
||||||
void LyricsFetcher::postProcess(std::string &data) const
|
void LyricsFetcher::postProcess(std::string &data) const
|
||||||
{
|
{
|
||||||
|
data = unescapeHtmlUtf8(data);
|
||||||
stripHtmlTags(data);
|
stripHtmlTags(data);
|
||||||
// Remove indentation from each line and collapse multiple newlines into one.
|
// Remove indentation from each line and collapse multiple newlines into one.
|
||||||
std::vector<std::string> lines;
|
std::vector<std::string> lines;
|
||||||
@@ -157,7 +158,6 @@ LyricsFetcher::Result LyricwikiFetcher::fetch(const std::string &artist, const s
|
|||||||
data.clear();
|
data.clear();
|
||||||
for (auto it = lyrics.begin(); it != lyrics.end(); ++it)
|
for (auto it = lyrics.begin(); it != lyrics.end(); ++it)
|
||||||
{
|
{
|
||||||
boost::replace_all(*it, "<br />", "\n");
|
|
||||||
stripHtmlTags(*it);
|
stripHtmlTags(*it);
|
||||||
boost::trim(*it);
|
boost::trim(*it);
|
||||||
if (!it->empty())
|
if (!it->empty())
|
||||||
@@ -226,34 +226,6 @@ bool GoogleLyricsFetcher::isURLOk(const std::string &url)
|
|||||||
|
|
||||||
/**********************************************************************/
|
/**********************************************************************/
|
||||||
|
|
||||||
void Sing365Fetcher::postProcess(std::string &data) const
|
|
||||||
{
|
|
||||||
// throw away ad
|
|
||||||
data = boost::regex_replace(data, boost::regex("<div.*</div>"), "");
|
|
||||||
LyricsFetcher::postProcess(data);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**********************************************************************/
|
|
||||||
|
|
||||||
void JustSomeLyricsFetcher::postProcess(std::string &data) const
|
|
||||||
{
|
|
||||||
data = unescapeHtmlUtf8(data);
|
|
||||||
LyricsFetcher::postProcess(data);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**********************************************************************/
|
|
||||||
|
|
||||||
void MetrolyricsFetcher::postProcess(std::string &data) const
|
|
||||||
{
|
|
||||||
// some of lyrics have both \n chars and <br />, html tags
|
|
||||||
// are always present whereas \n chars are not, so we need to
|
|
||||||
// throw them away to avoid having line breaks doubled.
|
|
||||||
boost::replace_all(data, "&#10;", "");
|
|
||||||
boost::replace_all(data, "<br />", "\n");
|
|
||||||
data = unescapeHtmlUtf8(data);
|
|
||||||
LyricsFetcher::postProcess(data);
|
|
||||||
}
|
|
||||||
|
|
||||||
bool MetrolyricsFetcher::isURLOk(const std::string &url)
|
bool MetrolyricsFetcher::isURLOk(const std::string &url)
|
||||||
{
|
{
|
||||||
// it sometimes return link to sitemap.xml, which is huge so we need to discard it
|
// it sometimes return link to sitemap.xml, which is huge so we need to discard it
|
||||||
|
|||||||
@@ -82,7 +82,6 @@ protected:
|
|||||||
virtual const char *regex() const OVERRIDE { return "<div class=\"lyrics-body\">(.*?)</div>"; }
|
virtual const char *regex() const OVERRIDE { return "<div class=\"lyrics-body\">(.*?)</div>"; }
|
||||||
|
|
||||||
virtual bool isURLOk(const std::string &url) OVERRIDE;
|
virtual bool isURLOk(const std::string &url) OVERRIDE;
|
||||||
virtual void postProcess(std::string &data) const OVERRIDE;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
struct LyricsmaniaFetcher : public GoogleLyricsFetcher
|
struct LyricsmaniaFetcher : public GoogleLyricsFetcher
|
||||||
@@ -99,8 +98,6 @@ struct Sing365Fetcher : public GoogleLyricsFetcher
|
|||||||
|
|
||||||
protected:
|
protected:
|
||||||
virtual const char *regex() const OVERRIDE { return "<!-Lyrics Begin->(.*?)<!-Lyrics End->"; }
|
virtual const char *regex() const OVERRIDE { return "<!-Lyrics Begin->(.*?)<!-Lyrics End->"; }
|
||||||
|
|
||||||
virtual void postProcess(std::string &data) const OVERRIDE;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
struct JustSomeLyricsFetcher : public GoogleLyricsFetcher
|
struct JustSomeLyricsFetcher : public GoogleLyricsFetcher
|
||||||
@@ -109,8 +106,6 @@ struct JustSomeLyricsFetcher : public GoogleLyricsFetcher
|
|||||||
|
|
||||||
protected:
|
protected:
|
||||||
virtual const char *regex() const OVERRIDE { return "<div class=\"content.*?</div>\\s*</div>(.*?)<div"; }
|
virtual const char *regex() const OVERRIDE { return "<div class=\"content.*?</div>\\s*</div>(.*?)<div"; }
|
||||||
|
|
||||||
virtual void postProcess(std::string &data) const OVERRIDE;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
struct AzLyricsFetcher : public GoogleLyricsFetcher
|
struct AzLyricsFetcher : public GoogleLyricsFetcher
|
||||||
|
|||||||
@@ -18,6 +18,7 @@
|
|||||||
* 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. *
|
* 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. *
|
||||||
***************************************************************************/
|
***************************************************************************/
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
#include <boost/algorithm/string/replace.hpp>
|
#include <boost/algorithm/string/replace.hpp>
|
||||||
#include "utility/html.h"
|
#include "utility/html.h"
|
||||||
|
|
||||||
@@ -58,20 +59,32 @@ void unescapeHtmlEntities(std::string &s)
|
|||||||
boost::replace_all(s, "&lt;", "<");
|
boost::replace_all(s, "&lt;", "<");
|
||||||
boost::replace_all(s, "&nbsp;", " ");
|
boost::replace_all(s, "&nbsp;", " ");
|
||||||
boost::replace_all(s, "&quot;", "\"");
|
boost::replace_all(s, "&quot;", "\"");
|
||||||
|
boost::replace_all(s, "&ndash;", "–");
|
||||||
|
boost::replace_all(s, "&mdash;", "—");
|
||||||
}
|
}
|
||||||
|
|
||||||
void stripHtmlTags(std::string &s)
|
void stripHtmlTags(std::string &s)
|
||||||
{
|
{
|
||||||
bool is_p, is_slash_p;
|
// Erase newlines so they don't duplicate with HTML ones.
|
||||||
|
s.erase(std::remove_if(s.begin(), s.end(), [](char c) {
|
||||||
|
return c == '\n' || c == '\r';
|
||||||
|
}), s.end());
|
||||||
|
|
||||||
|
bool is_newline;
|
||||||
for (size_t i = s.find("<"); i != std::string::npos; i = s.find("<"))
|
for (size_t i = s.find("<"); i != std::string::npos; i = s.find("<"))
|
||||||
{
|
{
|
||||||
size_t j = s.find(">", i);
|
size_t j = s.find(">", i);
|
||||||
if (j != std::string::npos)
|
if (j != std::string::npos)
|
||||||
{
|
{
|
||||||
++j;
|
++j;
|
||||||
is_p = s.compare(i, j-i, "<p ") == 0 || s.compare(i, j-i, "<p>") == 0;
|
is_newline
|
||||||
is_slash_p = s.compare(i, j-i, "</p>") == 0;
|
= s.compare(i, std::min<size_t>(3, j-i), "<p ") == 0
|
||||||
if (is_p || is_slash_p)
|
|| s.compare(i, j-i, "<p>") == 0
|
||||||
|
|| s.compare(i, j-i, "</p>") == 0
|
||||||
|
|| s.compare(i, j-i, "<br>") == 0
|
||||||
|
|| s.compare(i, j-i, "<br/>") == 0
|
||||||
|
|| s.compare(i, std::min<size_t>(4, j-i), "<br ") == 0;
|
||||||
|
if (is_newline)
|
||||||
s.replace(i, j-i, "\n");
|
s.replace(i, j-i, "\n");
|
||||||
else
|
else
|
||||||
s.replace(i, j-i, "");
|
s.replace(i, j-i, "");
|
||||||
|
|||||||
Reference in New Issue
Block a user