lyrics fetcher: throw away broken fetchers and fix/improve existing ones

This commit is contained in:
Andrzej Rybczak
2013-07-09 00:20:04 +02:00
parent af10ac883f
commit 6d6110a52b
2 changed files with 63 additions and 174 deletions

View File

@@ -18,6 +18,7 @@
* 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. * * 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. *
***************************************************************************/ ***************************************************************************/
#include "config.h"
#include "curl_handle.h" #include "curl_handle.h"
#ifdef HAVE_CURL_CURL_H #ifdef HAVE_CURL_CURL_H
@@ -26,6 +27,7 @@
#include <cstring> #include <cstring>
#include <boost/algorithm/string/replace.hpp> #include <boost/algorithm/string/replace.hpp>
#include <boost/algorithm/string/trim.hpp> #include <boost/algorithm/string/trim.hpp>
#include <boost/regex.hpp>
#include "charset.h" #include "charset.h"
#include "lyrics_fetcher.h" #include "lyrics_fetcher.h"
@@ -35,16 +37,10 @@
LyricsFetcher *lyricsPlugins[] = LyricsFetcher *lyricsPlugins[] =
{ {
new LyricwikiFetcher(), new LyricwikiFetcher(),
new LyricsvipFetcher(),
new Sing365Fetcher(), new Sing365Fetcher(),
new LoloLyricsFetcher(),
new LyriczzFetcher(),
new SonglyricsFetcher(),
new LyricsmaniaFetcher(), new LyricsmaniaFetcher(),
new LyricstimeFetcher(),
new MetrolyricsFetcher(), new MetrolyricsFetcher(),
new JustSomeLyricsFetcher(), new JustSomeLyricsFetcher(),
new LyrcComArFetcher(),
new InternetLyricsFetcher(), new InternetLyricsFetcher(),
0 0
}; };
@@ -56,7 +52,7 @@ LyricsFetcher::Result LyricsFetcher::fetch(const std::string &artist, const std:
Result result; Result result;
result.first = false; result.first = false;
std::string url = getURL(); std::string url = this->url();
boost::replace_all(url, "%artist%", artist.c_str()); boost::replace_all(url, "%artist%", artist.c_str());
boost::replace_all(url, "%title%", title.c_str()); boost::replace_all(url, "%title%", title.c_str());
@@ -69,35 +65,40 @@ LyricsFetcher::Result LyricsFetcher::fetch(const std::string &artist, const std:
return result; return result;
} }
bool parse_ok = getContent(getOpenTag(), getCloseTag(), data); auto lyrics = getContent(regex(), data);
if (!parse_ok || notLyrics(data)) if (lyrics.empty() || notLyrics(data))
{ {
result.second = msgNotFound; result.second = msgNotFound;
return result; return result;
} }
postProcess(data); data.clear();
for (auto it = lyrics.begin(); it != lyrics.end(); ++it)
{
postProcess(*it);
if (!it->empty())
{
data += *it;
if (it != lyrics.end()-1)
data += "\n\n----------\n\n";
}
}
result.second = data; result.second = data;
result.first = true; result.first = true;
return result; return result;
} }
bool LyricsFetcher::getContent(const char *open_tag, const char *close_tag, std::string &data) std::vector<std::string> LyricsFetcher::getContent(const char *regex, const std::string &data)
{ {
size_t a, b; std::vector<std::string> result;
if ((a = data.find(open_tag)) != std::string::npos) boost::regex rx(regex);
{ auto first = boost::sregex_iterator(data.begin(), data.end(), rx);
a += strlen(open_tag); auto last = boost::sregex_iterator();
if ((b = data.find(close_tag, a)) != std::string::npos) for (; first != last; ++first)
data = data.substr(a, b-a); result.push_back(first->str(1));
else return result;
return false;
}
else
return false;
return true;
} }
void LyricsFetcher::postProcess(std::string &data) void LyricsFetcher::postProcess(std::string &data)
@@ -124,23 +125,36 @@ LyricsFetcher::Result LyricwikiFetcher::fetch(const std::string &artist, const s
return result; return result;
} }
bool parse_ok = getContent("'17'/></a></div>", "<!--", data); auto lyrics = getContent("<div class='lyricbox'><div class='rtMatcher'>.*?</div>(.*?)<!--", data);
if (!parse_ok) if (lyrics.empty())
{ {
result.second = msgNotFound; result.second = msgNotFound;
return result; return result;
} }
data = unescapeHtmlUtf8(data); std::transform(lyrics.begin(), lyrics.end(), lyrics.begin(), unescapeHtmlUtf8);
if (data.find("Unfortunately, we are not licensed to display the full lyrics for this song at the moment.") != std::string::npos) bool license_restriction = std::any_of(lyrics.begin(), lyrics.end(), [](const std::string &s) {
return s.find("Unfortunately, we are not licensed to display the full lyrics for this song at the moment.") != std::string::npos;
});
if (license_restriction)
{ {
result.second = "Licence restriction"; result.second = "Licence restriction";
return result; return result;
} }
boost::replace_all(data, "<br />", "\n"); data.clear();
stripHtmlTags(data); for (auto it = lyrics.begin(); it != lyrics.end(); ++it)
boost::trim(data); {
boost::replace_all(*it, "<br />", "\n");
stripHtmlTags(*it);
boost::trim(*it);
if (!it->empty())
{
data += *it;
if (it != lyrics.end()-1)
data += "\n\n----------\n\n";
}
}
result.second = data; result.second = data;
result.first = true; result.first = true;
@@ -163,8 +177,8 @@ LyricsFetcher::Result GoogleLyricsFetcher::fetch(const std::string &artist, cons
std::string search_str = artist; std::string search_str = artist;
search_str += "+"; search_str += "+";
search_str += title; search_str += title;
search_str += "+"; search_str += "+%2B";
search_str += getSiteKeyword(); search_str += siteKeyword();
std::string google_url = "http://www.google.com/search?hl=en&ie=UTF-8&oe=UTF-8&q="; std::string google_url = "http://www.google.com/search?hl=en&ie=UTF-8&oe=UTF-8&q=";
google_url += search_str; google_url += search_str;
@@ -179,15 +193,15 @@ LyricsFetcher::Result GoogleLyricsFetcher::fetch(const std::string &artist, cons
return result; return result;
} }
bool found_url = getContent("<A HREF=\"", "\">here</A>", data); auto urls = getContent("<A HREF=\"(.*?)\">here</A>", data);
if (!found_url || !isURLOk(data)) if (urls.empty() || !isURLOk(urls[0]))
{ {
result.second = msgNotFound; result.second = msgNotFound;
return result; return result;
} }
data = unescapeHtmlUtf8(data); data = unescapeHtmlUtf8(urls[0]);
//result.second = data; //result.second = data;
//return result; //return result;
@@ -197,25 +211,7 @@ LyricsFetcher::Result GoogleLyricsFetcher::fetch(const std::string &artist, cons
bool GoogleLyricsFetcher::isURLOk(const std::string &url) bool GoogleLyricsFetcher::isURLOk(const std::string &url)
{ {
return url.find(getSiteKeyword()) != std::string::npos; return url.find(siteKeyword()) != std::string::npos;
}
/**********************************************************************/
/// Accepts a search-result URL only if the generic site-keyword check passes
/// and the URL is longer than a bare per-letter artist index page.
bool LyricstimeFetcher::isURLOk(const std::string &url)
{
	// The search sometimes yields the list of all artists starting with
	// a given letter (e.g. www.lyricstime.com/A.html). Such an address is
	// 25 characters long, so only strictly longer URLs are accepted.
	const std::string::size_type artist_index_url_length = 25;

	if (!GoogleLyricsFetcher::isURLOk(url))
		return false;
	return url.length() > artist_index_url_length;
}
void LyricstimeFetcher::postProcess(std::string &data)
{
// lyricstime.com uses iso-8859-1 as the encoding
// so we need to convert obtained lyrics to utf-8
data = Charset::toUtf8From(data, "iso-8859-1");
LyricsFetcher::postProcess(data);
} }
/**********************************************************************/ /**********************************************************************/
@@ -253,33 +249,6 @@ void LyricsmaniaFetcher::postProcess(std::string &data)
/**********************************************************************/ /**********************************************************************/
/// Cleans up lyrics text obtained from songlyrics.com.
///
/// Removes the first "[ ... lyrics are found on www.songlyrics.com ]" note
/// (if a complete one is present), unescapes HTML entities and then hands the
/// text over to the generic LyricsFetcher::postProcess.
///
/// @param data fetched text; modified in place.
void SonglyricsFetcher::postProcess(std::string &data)
{
	// Throw away the [ ... lyrics are found on www.songlyrics.com ] info.
	// The closing bracket is searched for only after the opening one, and
	// both positions are validated, so the computed length can never wrap
	// around when ']' is missing or precedes '['.
	const std::string::size_type open_pos = data.find('[');
	if (open_pos != std::string::npos)
	{
		const std::string::size_type close_pos = data.find(']', open_pos);
		if (close_pos != std::string::npos)
		{
			// +2 instead of +1 because there is an extra space after ']'
			// that should go away as well (erase clamps at end of string).
			data.erase(open_pos, close_pos-open_pos+2);
		}
	}

	data = unescapeHtmlUtf8(data);
	LyricsFetcher::postProcess(data);
}
/**********************************************************************/
/// Cleans up lyrics text obtained from lyricsvip.com.
///
/// Removes the advertisement <div> (if a complete one is present), unescapes
/// HTML entities and then hands the text over to the generic
/// LyricsFetcher::postProcess.
///
/// @param data fetched text; modified in place.
void LyricsvipFetcher::postProcess(std::string &data)
{
	// Throw away the <div> with the ad. The closing tag is looked up
	// starting at the ad's opening tag and both positions are validated,
	// so an earlier or missing "</div>" cannot produce a wrapped-around
	// length that would erase unrelated text.
	const std::string::size_type ad_begin = data.find("<div class=\"ad\"");
	if (ad_begin != std::string::npos)
	{
		const std::string::size_type ad_end = data.find("</div>", ad_begin);
		if (ad_end != std::string::npos)
			data.erase(ad_begin, ad_end-ad_begin+const_strlen("</div>"));
	}

	data = unescapeHtmlUtf8(data);
	LyricsFetcher::postProcess(data);
}
/**********************************************************************/
LyricsFetcher::Result InternetLyricsFetcher::fetch(const std::string &artist, const std::string &title) LyricsFetcher::Result InternetLyricsFetcher::fetch(const std::string &artist, const std::string &title)
{ {
GoogleLyricsFetcher::fetch(artist, title); GoogleLyricsFetcher::fetch(artist, title);

View File

@@ -35,37 +35,25 @@ struct LyricsFetcher
virtual Result fetch(const std::string &artist, const std::string &title); virtual Result fetch(const std::string &artist, const std::string &title);
protected: protected:
virtual const char *getURL() = 0; virtual const char *url() = 0;
virtual const char *getOpenTag() = 0; virtual const char *regex() = 0;
virtual const char *getCloseTag() = 0;
virtual bool notLyrics(const std::string &) { return false; } virtual bool notLyrics(const std::string &) { return false; }
virtual void postProcess(std::string &data); virtual void postProcess(std::string &data);
bool getContent(const char *open_tag, const char *close_tag, std::string &data); std::vector<std::string> getContent(const char *regex, const std::string &data);
static const char msgNotFound[]; static const char msgNotFound[];
}; };
/// Fetcher for lyrc.com.ar that queries the site directly.
struct LyrcComArFetcher : LyricsFetcher
{
	/// Name of the lyrics source.
	virtual const char *name()
	{
		return "lyrc.com.ar";
	}

protected:
	/// Query address containing the %artist% and %title% placeholders.
	virtual const char *getURL()
	{
		return "http://lyrc.com.ar/tema1es.php?artist=%artist%&songname=%title%";
	}

	/// Marker that precedes the lyrics in the returned page.
	virtual const char *getOpenTag()
	{
		return "</table>";
	}

	/// Marker that follows the lyrics in the returned page.
	virtual const char *getCloseTag()
	{
		return "<p>";
	}
};
struct LyricwikiFetcher : public LyricsFetcher struct LyricwikiFetcher : public LyricsFetcher
{ {
virtual const char *name() { return "lyricwiki.com"; } virtual const char *name() { return "lyricwiki.com"; }
virtual Result fetch(const std::string &artist, const std::string &title); virtual Result fetch(const std::string &artist, const std::string &title);
protected: protected:
virtual const char *getURL() { return "http://lyrics.wikia.com/api.php?action=lyrics&fmt=xml&func=getSong&artist=%artist%&song=%title%"; } virtual const char *url() { return "http://lyrics.wikia.com/api.php?action=lyrics&fmt=xml&func=getSong&artist=%artist%&song=%title%"; }
virtual const char *getOpenTag() { return "<url>"; } virtual const char *regex() { return "<url>(.*?)</url>"; }
virtual const char *getCloseTag() { return "</url>"; }
virtual bool notLyrics(const std::string &data); virtual bool notLyrics(const std::string &data);
}; };
@@ -77,8 +65,8 @@ struct GoogleLyricsFetcher : public LyricsFetcher
virtual Result fetch(const std::string &artist, const std::string &title); virtual Result fetch(const std::string &artist, const std::string &title);
protected: protected:
virtual const char *getSiteKeyword() = 0; virtual const char *url() { return URL; }
virtual const char *getURL() { return URL; } virtual const char *siteKeyword() { return name(); }
virtual bool isURLOk(const std::string &url); virtual bool isURLOk(const std::string &url);
@@ -86,31 +74,14 @@ private:
const char *URL; const char *URL;
}; };
/// Fetcher for lyricstime.com built on top of GoogleLyricsFetcher.
struct LyricstimeFetcher : GoogleLyricsFetcher
{
	/// Name of the lyrics source.
	virtual const char *name()
	{
		return "lyricstime.com";
	}

protected:
	/// Keyword identifying the site (appended to the search query and
	/// looked for in the resulting URL by GoogleLyricsFetcher).
	virtual const char *getSiteKeyword()
	{
		return "lyricstime";
	}

	/// Marker that precedes the lyrics in the page.
	virtual const char *getOpenTag()
	{
		return "<div id=\"songlyrics\" >";
	}

	/// Marker that follows the lyrics in the page.
	virtual const char *getCloseTag()
	{
		return "</div>";
	}

	/// Site-specific URL validation; defined out of line.
	virtual bool isURLOk(const std::string &url);

	/// Site-specific clean-up of the extracted text; defined out of line.
	virtual void postProcess(std::string &data);
};
struct MetrolyricsFetcher : public GoogleLyricsFetcher struct MetrolyricsFetcher : public GoogleLyricsFetcher
{ {
virtual const char *name() { return "metrolyrics.com"; } virtual const char *name() { return "metrolyrics.com"; }
protected: protected:
virtual const char *getSiteKeyword() { return "metrolyrics"; } virtual const char *regex() { return "<div id=\"lyrics-body\">(.*?)</div>"; }
virtual const char *getOpenTag() { return "<div id=\"lyrics\">"; }
virtual const char *getCloseTag() { return "</div>"; }
virtual bool isURLOk(const std::string &url); virtual bool isURLOk(const std::string &url);
virtual void postProcess(std::string &data); virtual void postProcess(std::string &data);
}; };
@@ -119,55 +90,17 @@ struct LyricsmaniaFetcher : public GoogleLyricsFetcher
virtual const char *name() { return "lyricsmania.com"; } virtual const char *name() { return "lyricsmania.com"; }
protected: protected:
virtual const char *getSiteKeyword() { return "lyricsmania"; } virtual const char *regex() { return "<div id='songlyrics_h' class='dn'>(.*?)</div>"; }
virtual const char *getOpenTag() { return "</strong> :<br />"; }
virtual const char *getCloseTag() { return "&#91; <a"; }
virtual void postProcess(std::string &data); virtual void postProcess(std::string &data);
}; };
/// Fetcher for songlyrics.com built on top of GoogleLyricsFetcher.
struct SonglyricsFetcher : GoogleLyricsFetcher
{
	/// Name of the lyrics source.
	virtual const char *name()
	{
		return "songlyrics.com";
	}

protected:
	/// Keyword identifying the site (appended to the search query and
	/// looked for in the resulting URL by GoogleLyricsFetcher).
	virtual const char *getSiteKeyword()
	{
		return "songlyrics";
	}

	/// Marker that precedes the lyrics in the page.
	virtual const char *getOpenTag()
	{
		return "-6000px;\">";
	}

	/// Marker that follows the lyrics in the page.
	virtual const char *getCloseTag()
	{
		return "</p>";
	}

	/// Site-specific clean-up of the extracted text; defined out of line.
	virtual void postProcess(std::string &data);
};
/// Fetcher for lyriczz.com built on top of GoogleLyricsFetcher.
struct LyriczzFetcher : GoogleLyricsFetcher
{
	/// Name of the lyrics source.
	virtual const char *name()
	{
		return "lyriczz.com";
	}

protected:
	/// Keyword identifying the site (appended to the search query and
	/// looked for in the resulting URL by GoogleLyricsFetcher).
	virtual const char *getSiteKeyword()
	{
		return "lyriczz";
	}

	/// Marker that precedes the lyrics in the page.
	virtual const char *getOpenTag()
	{
		return "border=0 /></a>";
	}

	/// Marker that follows the lyrics in the page.
	virtual const char *getCloseTag()
	{
		return "<a href";
	}
};
struct Sing365Fetcher : public GoogleLyricsFetcher struct Sing365Fetcher : public GoogleLyricsFetcher
{ {
virtual const char *name() { return "sing365.com"; } virtual const char *name() { return "sing365.com"; }
protected: protected:
virtual const char *getSiteKeyword() { return "sing365"; } virtual const char *regex() { return "<div style=\"font-size: 14px;\">(.*?)</div>"; }
virtual const char *getOpenTag() { return "<br><br></div>"; }
virtual const char *getCloseTag() { return "<div align"; }
};
struct LyricsvipFetcher : public GoogleLyricsFetcher
{
virtual const char *name() { return "lyricsvip.com"; }
protected:
virtual const char *getSiteKeyword() { return "lyricsvip"; }
virtual const char *getOpenTag() { return "</h2>"; }
virtual const char *getCloseTag() { return "</td>"; }
virtual void postProcess(std::string &data);
}; };
struct JustSomeLyricsFetcher : public GoogleLyricsFetcher struct JustSomeLyricsFetcher : public GoogleLyricsFetcher
@@ -175,19 +108,7 @@ struct JustSomeLyricsFetcher : public GoogleLyricsFetcher
virtual const char *name() { return "justsomelyrics.com"; } virtual const char *name() { return "justsomelyrics.com"; }
protected: protected:
virtual const char *getSiteKeyword() { return "justsomelyrics"; } virtual const char *regex() { return "<p class=\"lyrics\">(.*?)</p>"; }
virtual const char *getOpenTag() { return "alt=\"phone\" />\n</div>"; }
virtual const char *getCloseTag() { return "<div class=\"adsdiv\">"; }
};
struct LoloLyricsFetcher : public GoogleLyricsFetcher
{
virtual const char *name() { return "lololyrics.com"; }
protected:
virtual const char *getSiteKeyword() { return "lololyrics"; }
virtual const char *getOpenTag() { return "<div class=\"lyrics_txt\" id=\"lyrics_txt\" style=\"font-size:12px; letter-spacing:0.2px; line-height:20px;\">"; }
virtual const char *getCloseTag() { return "</div>"; }
}; };
struct InternetLyricsFetcher : public GoogleLyricsFetcher struct InternetLyricsFetcher : public GoogleLyricsFetcher
@@ -196,9 +117,8 @@ struct InternetLyricsFetcher : public GoogleLyricsFetcher
virtual Result fetch(const std::string &artist, const std::string &title); virtual Result fetch(const std::string &artist, const std::string &title);
protected: protected:
virtual const char *getSiteKeyword() { return "lyrics"; } virtual const char *siteKeyword() { return "lyrics"; }
virtual const char *getOpenTag() { return ""; } virtual const char *regex() { return ""; }
virtual const char *getCloseTag() { return ""; }
virtual bool isURLOk(const std::string &url); virtual bool isURLOk(const std::string &url);