lyrics fetcher: throw away broken fetchers and fix/improve existing ones

This commit is contained in:
Andrzej Rybczak
2013-07-09 00:20:04 +02:00
parent af10ac883f
commit 6d6110a52b
2 changed files with 63 additions and 174 deletions

View File

@@ -18,6 +18,7 @@
* 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. *
***************************************************************************/
#include "config.h"
#include "curl_handle.h"
#ifdef HAVE_CURL_CURL_H
@@ -26,6 +27,7 @@
#include <cstring>
#include <boost/algorithm/string/replace.hpp>
#include <boost/algorithm/string/trim.hpp>
#include <boost/regex.hpp>
#include "charset.h"
#include "lyrics_fetcher.h"
@@ -35,16 +37,10 @@
// Table of the available lyrics fetcher instances. The final 0 entry is a
// null terminator that marks the end of the array.
LyricsFetcher *lyricsPlugins[] =
{
new LyricwikiFetcher(),
new LyricsvipFetcher(),
new Sing365Fetcher(),
new LoloLyricsFetcher(),
new LyriczzFetcher(),
new SonglyricsFetcher(),
new LyricsmaniaFetcher(),
new LyricstimeFetcher(),
new MetrolyricsFetcher(),
new JustSomeLyricsFetcher(),
new LyrcComArFetcher(),
new InternetLyricsFetcher(),
// end-of-table sentinel
0
};
@@ -56,7 +52,7 @@ LyricsFetcher::Result LyricsFetcher::fetch(const std::string &artist, const std:
Result result;
result.first = false;
std::string url = getURL();
std::string url = this->url();
boost::replace_all(url, "%artist%", artist.c_str());
boost::replace_all(url, "%title%", title.c_str());
@@ -69,35 +65,40 @@ LyricsFetcher::Result LyricsFetcher::fetch(const std::string &artist, const std:
return result;
}
bool parse_ok = getContent(getOpenTag(), getCloseTag(), data);
auto lyrics = getContent(regex(), data);
if (!parse_ok || notLyrics(data))
if (lyrics.empty() || notLyrics(data))
{
result.second = msgNotFound;
return result;
}
postProcess(data);
data.clear();
for (auto it = lyrics.begin(); it != lyrics.end(); ++it)
{
postProcess(*it);
if (!it->empty())
{
data += *it;
if (it != lyrics.end()-1)
data += "\n\n----------\n\n";
}
}
result.second = data;
result.first = true;
return result;
}
bool LyricsFetcher::getContent(const char *open_tag, const char *close_tag, std::string &data)
std::vector<std::string> LyricsFetcher::getContent(const char *regex, const std::string &data)
{
size_t a, b;
if ((a = data.find(open_tag)) != std::string::npos)
{
a += strlen(open_tag);
if ((b = data.find(close_tag, a)) != std::string::npos)
data = data.substr(a, b-a);
else
return false;
}
else
return false;
return true;
std::vector<std::string> result;
boost::regex rx(regex);
auto first = boost::sregex_iterator(data.begin(), data.end(), rx);
auto last = boost::sregex_iterator();
for (; first != last; ++first)
result.push_back(first->str(1));
return result;
}
void LyricsFetcher::postProcess(std::string &data)
@@ -124,23 +125,36 @@ LyricsFetcher::Result LyricwikiFetcher::fetch(const std::string &artist, const s
return result;
}
bool parse_ok = getContent("'17'/></a></div>", "<!--", data);
auto lyrics = getContent("<div class='lyricbox'><div class='rtMatcher'>.*?</div>(.*?)<!--", data);
if (!parse_ok)
if (lyrics.empty())
{
result.second = msgNotFound;
return result;
}
data = unescapeHtmlUtf8(data);
if (data.find("Unfortunately, we are not licensed to display the full lyrics for this song at the moment.") != std::string::npos)
std::transform(lyrics.begin(), lyrics.end(), lyrics.begin(), unescapeHtmlUtf8);
bool license_restriction = std::any_of(lyrics.begin(), lyrics.end(), [](const std::string &s) {
return s.find("Unfortunately, we are not licensed to display the full lyrics for this song at the moment.") != std::string::npos;
});
if (license_restriction)
{
result.second = "Licence restriction";
return result;
}
boost::replace_all(data, "<br />", "\n");
stripHtmlTags(data);
boost::trim(data);
data.clear();
for (auto it = lyrics.begin(); it != lyrics.end(); ++it)
{
boost::replace_all(*it, "<br />", "\n");
stripHtmlTags(*it);
boost::trim(*it);
if (!it->empty())
{
data += *it;
if (it != lyrics.end()-1)
data += "\n\n----------\n\n";
}
}
result.second = data;
result.first = true;
@@ -163,8 +177,8 @@ LyricsFetcher::Result GoogleLyricsFetcher::fetch(const std::string &artist, cons
std::string search_str = artist;
search_str += "+";
search_str += title;
search_str += "+";
search_str += getSiteKeyword();
search_str += "+%2B";
search_str += siteKeyword();
std::string google_url = "http://www.google.com/search?hl=en&ie=UTF-8&oe=UTF-8&q=";
google_url += search_str;
@@ -179,15 +193,15 @@ LyricsFetcher::Result GoogleLyricsFetcher::fetch(const std::string &artist, cons
return result;
}
bool found_url = getContent("<A HREF=\"", "\">here</A>", data);
auto urls = getContent("<A HREF=\"(.*?)\">here</A>", data);
if (!found_url || !isURLOk(data))
if (urls.empty() || !isURLOk(urls[0]))
{
result.second = msgNotFound;
return result;
}
data = unescapeHtmlUtf8(data);
data = unescapeHtmlUtf8(urls[0]);
//result.second = data;
//return result;
@@ -197,25 +211,7 @@ LyricsFetcher::Result GoogleLyricsFetcher::fetch(const std::string &artist, cons
// Accepts a URL only when it contains the fetcher's site keyword.
bool GoogleLyricsFetcher::isURLOk(const std::string &url)
{
	const std::string::size_type keyword_pos = url.find(getSiteKeyword());
	if (keyword_pos == std::string::npos)
		return false;
	return true;
}
/**********************************************************************/
// Accepts a URL that passes the generic keyword check and is longer than
// 25 characters.
bool LyricstimeFetcher::isURLOk(const std::string &url)
{
	if (!GoogleLyricsFetcher::isURLOk(url))
		return false;
	// The search sometimes yields the list of all artists starting with a
	// given letter, e.g. www.lyricstime.com/A.html. That address is 25
	// characters long, so only strictly longer URLs are accepted.
	const std::string::size_type artist_index_url_length = 25;
	return url.length() > artist_index_url_length;
}
void LyricstimeFetcher::postProcess(std::string &data)
{
// lyricstime.com uses iso-8859-1 as the encoding
// so we need to convert obtained lyrics to utf-8
data = Charset::toUtf8From(data, "iso-8859-1");
LyricsFetcher::postProcess(data);
return url.find(siteKeyword()) != std::string::npos;
}
/**********************************************************************/
@@ -253,33 +249,6 @@ void LyricsmaniaFetcher::postProcess(std::string &data)
/**********************************************************************/
// Strips the bracketed "[ ... lyrics are found on www.songlyrics.com ]" note
// from the lyrics, unescapes HTML entities and then runs the generic
// LyricsFetcher post-processing.
//
// data: lyrics text, modified in place.
void SonglyricsFetcher::postProcess(std::string &data)
{
	const std::string::size_type open = data.find('[');
	if (open != std::string::npos)
	{
		// Look for the closing bracket only after the opening one, and make
		// sure it exists. The previous guard tested the opening position
		// twice, so a missing or earlier ']' made the length wrap around and
		// erased the wrong part of the text.
		const std::string::size_type close = data.find(']', open);
		if (close != std::string::npos)
		{
			// +2 instead of +1: the extra space after ']' is dropped too
			// (erase clamps the count at the end of the string).
			data.erase(open, close-open+2);
		}
	}
	data = unescapeHtmlUtf8(data);
	LyricsFetcher::postProcess(data);
}
/**********************************************************************/
// Removes the advertisement <div> from the lyrics, unescapes HTML entities
// and then runs the generic LyricsFetcher post-processing.
//
// data: lyrics text, modified in place.
void LyricsvipFetcher::postProcess(std::string &data)
{
	const std::string::size_type ad_begin = data.find("<div class=\"ad\"");
	if (ad_begin != std::string::npos)
	{
		// Search for the closing tag only after the ad's opening tag and
		// verify it was found. The previous guard tested the opening
		// position twice, so a missing or earlier "</div>" produced a
		// wrapped-around length and removed the wrong part of the text.
		const std::string::size_type ad_end = data.find("</div>", ad_begin);
		if (ad_end != std::string::npos)
			data.erase(ad_begin, ad_end-ad_begin+const_strlen("</div>"));
	}
	data = unescapeHtmlUtf8(data);
	LyricsFetcher::postProcess(data);
}
/**********************************************************************/
LyricsFetcher::Result InternetLyricsFetcher::fetch(const std::string &artist, const std::string &title)
{
GoogleLyricsFetcher::fetch(artist, title);

View File

@@ -35,37 +35,25 @@ struct LyricsFetcher
virtual Result fetch(const std::string &artist, const std::string &title);
protected:
virtual const char *getURL() = 0;
virtual const char *getOpenTag() = 0;
virtual const char *getCloseTag() = 0;
virtual const char *url() = 0;
virtual const char *regex() = 0;
virtual bool notLyrics(const std::string &) { return false; }
virtual void postProcess(std::string &data);
bool getContent(const char *open_tag, const char *close_tag, std::string &data);
std::vector<std::string> getContent(const char *regex, const std::string &data);
static const char msgNotFound[];
};
// Fetcher description for lyrc.com.ar: provides the site name, the request
// URL template and the pair of marker strings returned as open/close tags.
struct LyrcComArFetcher : public LyricsFetcher
{
// Name of the fetcher.
virtual const char *name() { return "lyrc.com.ar"; }
protected:
// URL template; contains the %artist% and %title% placeholders.
virtual const char *getURL() { return "http://lyrc.com.ar/tema1es.php?artist=%artist%&songname=%title%"; }
// Marker string returned as the opening tag.
virtual const char *getOpenTag() { return "</table>"; }
// Marker string returned as the closing tag.
virtual const char *getCloseTag() { return "<p>"; }
};
struct LyricwikiFetcher : public LyricsFetcher
{
virtual const char *name() { return "lyricwiki.com"; }
virtual Result fetch(const std::string &artist, const std::string &title);
protected:
virtual const char *getURL() { return "http://lyrics.wikia.com/api.php?action=lyrics&fmt=xml&func=getSong&artist=%artist%&song=%title%"; }
virtual const char *getOpenTag() { return "<url>"; }
virtual const char *getCloseTag() { return "</url>"; }
virtual const char *url() { return "http://lyrics.wikia.com/api.php?action=lyrics&fmt=xml&func=getSong&artist=%artist%&song=%title%"; }
virtual const char *regex() { return "<url>(.*?)</url>"; }
virtual bool notLyrics(const std::string &data);
};
@@ -77,8 +65,8 @@ struct GoogleLyricsFetcher : public LyricsFetcher
virtual Result fetch(const std::string &artist, const std::string &title);
protected:
virtual const char *getSiteKeyword() = 0;
virtual const char *getURL() { return URL; }
virtual const char *url() { return URL; }
virtual const char *siteKeyword() { return name(); }
virtual bool isURLOk(const std::string &url);
@@ -86,31 +74,14 @@ private:
const char *URL;
};
// Fetcher description for lyricstime.com, built on GoogleLyricsFetcher.
// Besides the site keyword and open/close marker strings it declares its own
// isURLOk() and postProcess() overrides.
struct LyricstimeFetcher : public GoogleLyricsFetcher
{
// Name of the fetcher.
virtual const char *name() { return "lyricstime.com"; }
protected:
// Keyword identifying the site.
virtual const char *getSiteKeyword() { return "lyricstime"; }
// Marker strings returned as the opening and closing tags.
virtual const char *getOpenTag() { return "<div id=\"songlyrics\" >"; }
virtual const char *getCloseTag() { return "</div>"; }
// Site-specific URL validation (defined out of line).
virtual bool isURLOk(const std::string &url);
// Site-specific post-processing of the lyrics text (defined out of line).
virtual void postProcess(std::string &data);
};
struct MetrolyricsFetcher : public GoogleLyricsFetcher
{
virtual const char *name() { return "metrolyrics.com"; }
protected:
virtual const char *getSiteKeyword() { return "metrolyrics"; }
virtual const char *getOpenTag() { return "<div id=\"lyrics\">"; }
virtual const char *getCloseTag() { return "</div>"; }
virtual const char *regex() { return "<div id=\"lyrics-body\">(.*?)</div>"; }
virtual bool isURLOk(const std::string &url);
virtual void postProcess(std::string &data);
};
@@ -119,55 +90,17 @@ struct LyricsmaniaFetcher : public GoogleLyricsFetcher
virtual const char *name() { return "lyricsmania.com"; }
protected:
virtual const char *getSiteKeyword() { return "lyricsmania"; }
virtual const char *getOpenTag() { return "</strong> :<br />"; }
virtual const char *getCloseTag() { return "&#91; <a"; }
virtual const char *regex() { return "<div id='songlyrics_h' class='dn'>(.*?)</div>"; }
virtual void postProcess(std::string &data);
};
// Fetcher description for songlyrics.com, built on GoogleLyricsFetcher.
// Supplies the site keyword and open/close marker strings and declares a
// site-specific postProcess() override.
struct SonglyricsFetcher : public GoogleLyricsFetcher
{
// Name of the fetcher.
virtual const char *name() { return "songlyrics.com"; }
protected:
// Keyword identifying the site.
virtual const char *getSiteKeyword() { return "songlyrics"; }
// Marker strings returned as the opening and closing tags.
virtual const char *getOpenTag() { return "-6000px;\">"; }
virtual const char *getCloseTag() { return "</p>"; }
// Site-specific post-processing of the lyrics text (defined out of line).
virtual void postProcess(std::string &data);
};
// Fetcher description for lyriczz.com, built on GoogleLyricsFetcher.
// Supplies only the site keyword and the open/close marker strings.
struct LyriczzFetcher : public GoogleLyricsFetcher
{
// Name of the fetcher.
virtual const char *name() { return "lyriczz.com"; }
protected:
// Keyword identifying the site.
virtual const char *getSiteKeyword() { return "lyriczz"; }
// Marker strings returned as the opening and closing tags.
virtual const char *getOpenTag() { return "border=0 /></a>"; }
virtual const char *getCloseTag() { return "<a href"; }
};
// Fetcher description for sing365.com, built on GoogleLyricsFetcher.
// Supplies only the site keyword and the open/close marker strings.
struct Sing365Fetcher : public GoogleLyricsFetcher
{
// Name of the fetcher.
virtual const char *name() { return "sing365.com"; }
protected:
// Keyword identifying the site.
virtual const char *getSiteKeyword() { return "sing365"; }
// Marker strings returned as the opening and closing tags.
virtual const char *getOpenTag() { return "<br><br></div>"; }
virtual const char *getCloseTag() { return "<div align"; }
};
struct LyricsvipFetcher : public GoogleLyricsFetcher
{
virtual const char *name() { return "lyricsvip.com"; }
protected:
virtual const char *getSiteKeyword() { return "lyricsvip"; }
virtual const char *getOpenTag() { return "</h2>"; }
virtual const char *getCloseTag() { return "</td>"; }
virtual void postProcess(std::string &data);
virtual const char *regex() { return "<div style=\"font-size: 14px;\">(.*?)</div>"; }
};
struct JustSomeLyricsFetcher : public GoogleLyricsFetcher
@@ -175,19 +108,7 @@ struct JustSomeLyricsFetcher : public GoogleLyricsFetcher
virtual const char *name() { return "justsomelyrics.com"; }
protected:
virtual const char *getSiteKeyword() { return "justsomelyrics"; }
virtual const char *getOpenTag() { return "alt=\"phone\" />\n</div>"; }
virtual const char *getCloseTag() { return "<div class=\"adsdiv\">"; }
};
struct LoloLyricsFetcher : public GoogleLyricsFetcher
{
virtual const char *name() { return "lololyrics.com"; }
protected:
virtual const char *getSiteKeyword() { return "lololyrics"; }
virtual const char *getOpenTag() { return "<div class=\"lyrics_txt\" id=\"lyrics_txt\" style=\"font-size:12px; letter-spacing:0.2px; line-height:20px;\">"; }
virtual const char *getCloseTag() { return "</div>"; }
virtual const char *regex() { return "<p class=\"lyrics\">(.*?)</p>"; }
};
struct InternetLyricsFetcher : public GoogleLyricsFetcher
@@ -196,9 +117,8 @@ struct InternetLyricsFetcher : public GoogleLyricsFetcher
virtual Result fetch(const std::string &artist, const std::string &title);
protected:
virtual const char *getSiteKeyword() { return "lyrics"; }
virtual const char *getOpenTag() { return ""; }
virtual const char *getCloseTag() { return ""; }
virtual const char *siteKeyword() { return "lyrics"; }
virtual const char *regex() { return ""; }
virtual bool isURLOk(const std::string &url);