lyrics fetcher: throw away broken fetchers and fix/improve existing ones
This commit is contained in:
@@ -18,6 +18,7 @@
|
||||
* 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. *
|
||||
***************************************************************************/
|
||||
|
||||
#include "config.h"
|
||||
#include "curl_handle.h"
|
||||
|
||||
#ifdef HAVE_CURL_CURL_H
|
||||
@@ -26,6 +27,7 @@
|
||||
#include <cstring>
|
||||
#include <boost/algorithm/string/replace.hpp>
|
||||
#include <boost/algorithm/string/trim.hpp>
|
||||
#include <boost/regex.hpp>
|
||||
|
||||
#include "charset.h"
|
||||
#include "lyrics_fetcher.h"
|
||||
@@ -35,16 +37,10 @@
|
||||
// Table of lyrics fetcher instances, listed in the order given below.
// The last slot holds a null pointer that marks the end of the table
// (presumably consumers walk the array until they hit it - confirm at the
// call sites).
LyricsFetcher *lyricsPlugins[] =
{
	new LyricwikiFetcher(),
	new LyricsvipFetcher(),
	new Sing365Fetcher(),
	new LoloLyricsFetcher(),
	new LyriczzFetcher(),
	new SonglyricsFetcher(),
	new LyricsmaniaFetcher(),
	new LyricstimeFetcher(),
	new MetrolyricsFetcher(),
	new JustSomeLyricsFetcher(),
	new LyrcComArFetcher(),
	new InternetLyricsFetcher(),
	nullptr // end-of-table marker
};
|
||||
@@ -56,7 +52,7 @@ LyricsFetcher::Result LyricsFetcher::fetch(const std::string &artist, const std:
|
||||
Result result;
|
||||
result.first = false;
|
||||
|
||||
std::string url = getURL();
|
||||
std::string url = this->url();
|
||||
boost::replace_all(url, "%artist%", artist.c_str());
|
||||
boost::replace_all(url, "%title%", title.c_str());
|
||||
|
||||
@@ -69,35 +65,40 @@ LyricsFetcher::Result LyricsFetcher::fetch(const std::string &artist, const std:
|
||||
return result;
|
||||
}
|
||||
|
||||
bool parse_ok = getContent(getOpenTag(), getCloseTag(), data);
|
||||
auto lyrics = getContent(regex(), data);
|
||||
|
||||
if (!parse_ok || notLyrics(data))
|
||||
if (lyrics.empty() || notLyrics(data))
|
||||
{
|
||||
result.second = msgNotFound;
|
||||
return result;
|
||||
}
|
||||
|
||||
postProcess(data);
|
||||
data.clear();
|
||||
for (auto it = lyrics.begin(); it != lyrics.end(); ++it)
|
||||
{
|
||||
postProcess(*it);
|
||||
if (!it->empty())
|
||||
{
|
||||
data += *it;
|
||||
if (it != lyrics.end()-1)
|
||||
data += "\n\n----------\n\n";
|
||||
}
|
||||
}
|
||||
|
||||
result.second = data;
|
||||
result.first = true;
|
||||
return result;
|
||||
}
|
||||
|
||||
bool LyricsFetcher::getContent(const char *open_tag, const char *close_tag, std::string &data)
|
||||
std::vector<std::string> LyricsFetcher::getContent(const char *regex, const std::string &data)
|
||||
{
|
||||
size_t a, b;
|
||||
if ((a = data.find(open_tag)) != std::string::npos)
|
||||
{
|
||||
a += strlen(open_tag);
|
||||
if ((b = data.find(close_tag, a)) != std::string::npos)
|
||||
data = data.substr(a, b-a);
|
||||
else
|
||||
return false;
|
||||
}
|
||||
else
|
||||
return false;
|
||||
return true;
|
||||
std::vector<std::string> result;
|
||||
boost::regex rx(regex);
|
||||
auto first = boost::sregex_iterator(data.begin(), data.end(), rx);
|
||||
auto last = boost::sregex_iterator();
|
||||
for (; first != last; ++first)
|
||||
result.push_back(first->str(1));
|
||||
return result;
|
||||
}
|
||||
|
||||
void LyricsFetcher::postProcess(std::string &data)
|
||||
@@ -124,23 +125,36 @@ LyricsFetcher::Result LyricwikiFetcher::fetch(const std::string &artist, const s
|
||||
return result;
|
||||
}
|
||||
|
||||
bool parse_ok = getContent("'17'/></a></div>", "<!--", data);
|
||||
auto lyrics = getContent("<div class='lyricbox'><div class='rtMatcher'>.*?</div>(.*?)<!--", data);
|
||||
|
||||
if (!parse_ok)
|
||||
if (lyrics.empty())
|
||||
{
|
||||
result.second = msgNotFound;
|
||||
return result;
|
||||
}
|
||||
data = unescapeHtmlUtf8(data);
|
||||
if (data.find("Unfortunately, we are not licensed to display the full lyrics for this song at the moment.") != std::string::npos)
|
||||
std::transform(lyrics.begin(), lyrics.end(), lyrics.begin(), unescapeHtmlUtf8);
|
||||
bool license_restriction = std::any_of(lyrics.begin(), lyrics.end(), [](const std::string &s) {
|
||||
return s.find("Unfortunately, we are not licensed to display the full lyrics for this song at the moment.") != std::string::npos;
|
||||
});
|
||||
if (license_restriction)
|
||||
{
|
||||
result.second = "Licence restriction";
|
||||
return result;
|
||||
}
|
||||
|
||||
boost::replace_all(data, "<br />", "\n");
|
||||
stripHtmlTags(data);
|
||||
boost::trim(data);
|
||||
data.clear();
|
||||
for (auto it = lyrics.begin(); it != lyrics.end(); ++it)
|
||||
{
|
||||
boost::replace_all(*it, "<br />", "\n");
|
||||
stripHtmlTags(*it);
|
||||
boost::trim(*it);
|
||||
if (!it->empty())
|
||||
{
|
||||
data += *it;
|
||||
if (it != lyrics.end()-1)
|
||||
data += "\n\n----------\n\n";
|
||||
}
|
||||
}
|
||||
|
||||
result.second = data;
|
||||
result.first = true;
|
||||
@@ -163,8 +177,8 @@ LyricsFetcher::Result GoogleLyricsFetcher::fetch(const std::string &artist, cons
|
||||
std::string search_str = artist;
|
||||
search_str += "+";
|
||||
search_str += title;
|
||||
search_str += "+";
|
||||
search_str += getSiteKeyword();
|
||||
search_str += "+%2B";
|
||||
search_str += siteKeyword();
|
||||
|
||||
std::string google_url = "http://www.google.com/search?hl=en&ie=UTF-8&oe=UTF-8&q=";
|
||||
google_url += search_str;
|
||||
@@ -179,15 +193,15 @@ LyricsFetcher::Result GoogleLyricsFetcher::fetch(const std::string &artist, cons
|
||||
return result;
|
||||
}
|
||||
|
||||
bool found_url = getContent("<A HREF=\"", "\">here</A>", data);
|
||||
auto urls = getContent("<A HREF=\"(.*?)\">here</A>", data);
|
||||
|
||||
if (!found_url || !isURLOk(data))
|
||||
if (urls.empty() || !isURLOk(urls[0]))
|
||||
{
|
||||
result.second = msgNotFound;
|
||||
return result;
|
||||
}
|
||||
|
||||
data = unescapeHtmlUtf8(data);
|
||||
data = unescapeHtmlUtf8(urls[0]);
|
||||
//result.second = data;
|
||||
//return result;
|
||||
|
||||
@@ -197,25 +211,7 @@ LyricsFetcher::Result GoogleLyricsFetcher::fetch(const std::string &artist, cons
|
||||
|
||||
bool GoogleLyricsFetcher::isURLOk(const std::string &url)
|
||||
{
|
||||
return url.find(getSiteKeyword()) != std::string::npos;
|
||||
}
|
||||
|
||||
/**********************************************************************/
|
||||
|
||||
bool LyricstimeFetcher::isURLOk(const std::string &url)
|
||||
{
|
||||
// it sometimes returns list of all artists that begin
|
||||
// with a given letter, e.g. www.lyricstime.com/A.html, which
|
||||
// is 25 chars long, so we want longer.
|
||||
return GoogleLyricsFetcher::isURLOk(url) && url.length() > 25;
|
||||
}
|
||||
|
||||
void LyricstimeFetcher::postProcess(std::string &data)
|
||||
{
|
||||
// lyricstime.com uses iso-8859-1 as the encoding
|
||||
// so we need to convert obtained lyrics to utf-8
|
||||
data = Charset::toUtf8From(data, "iso-8859-1");
|
||||
LyricsFetcher::postProcess(data);
|
||||
return url.find(siteKeyword()) != std::string::npos;
|
||||
}
|
||||
|
||||
/**********************************************************************/
|
||||
@@ -253,33 +249,6 @@ void LyricsmaniaFetcher::postProcess(std::string &data)
|
||||
|
||||
/**********************************************************************/
|
||||
|
||||
void SonglyricsFetcher::postProcess(std::string &data)
|
||||
{
|
||||
// throw away [ ... lyrics are found on www.songlyrics.com ] info.
|
||||
// there is +2 instead of +1 in third line because there is extra
|
||||
// space after ] we also want to get rid of
|
||||
size_t i = data.find('['), j = data.find(']');
|
||||
if (i != std::string::npos && i != std::string::npos)
|
||||
data.replace(i, j-i+2, "");
|
||||
data = unescapeHtmlUtf8(data);
|
||||
LyricsFetcher::postProcess(data);
|
||||
}
|
||||
|
||||
|
||||
/**********************************************************************/
|
||||
|
||||
void LyricsvipFetcher::postProcess(std::string &data)
|
||||
{
|
||||
// throw away <div> with ad
|
||||
size_t i = data.find("<div class=\"ad\""), j = data.find("</div>");
|
||||
if (i != std::string::npos && i != std::string::npos)
|
||||
data.replace(i, j-i+const_strlen("</div>"), "");
|
||||
data = unescapeHtmlUtf8(data);
|
||||
LyricsFetcher::postProcess(data);
|
||||
}
|
||||
|
||||
/**********************************************************************/
|
||||
|
||||
LyricsFetcher::Result InternetLyricsFetcher::fetch(const std::string &artist, const std::string &title)
|
||||
{
|
||||
GoogleLyricsFetcher::fetch(artist, title);
|
||||
|
||||
@@ -35,37 +35,25 @@ struct LyricsFetcher
|
||||
virtual Result fetch(const std::string &artist, const std::string &title);
|
||||
|
||||
protected:
|
||||
virtual const char *getURL() = 0;
|
||||
virtual const char *getOpenTag() = 0;
|
||||
virtual const char *getCloseTag() = 0;
|
||||
virtual const char *url() = 0;
|
||||
virtual const char *regex() = 0;
|
||||
|
||||
virtual bool notLyrics(const std::string &) { return false; }
|
||||
virtual void postProcess(std::string &data);
|
||||
|
||||
bool getContent(const char *open_tag, const char *close_tag, std::string &data);
|
||||
std::vector<std::string> getContent(const char *regex, const std::string &data);
|
||||
|
||||
static const char msgNotFound[];
|
||||
};
|
||||
|
||||
struct LyrcComArFetcher : public LyricsFetcher
|
||||
{
|
||||
virtual const char *name() { return "lyrc.com.ar"; }
|
||||
|
||||
protected:
|
||||
virtual const char *getURL() { return "http://lyrc.com.ar/tema1es.php?artist=%artist%&songname=%title%"; }
|
||||
virtual const char *getOpenTag() { return "</table>"; }
|
||||
virtual const char *getCloseTag() { return "<p>"; }
|
||||
};
|
||||
|
||||
struct LyricwikiFetcher : public LyricsFetcher
|
||||
{
|
||||
virtual const char *name() { return "lyricwiki.com"; }
|
||||
virtual Result fetch(const std::string &artist, const std::string &title);
|
||||
|
||||
protected:
|
||||
virtual const char *getURL() { return "http://lyrics.wikia.com/api.php?action=lyrics&fmt=xml&func=getSong&artist=%artist%&song=%title%"; }
|
||||
virtual const char *getOpenTag() { return "<url>"; }
|
||||
virtual const char *getCloseTag() { return "</url>"; }
|
||||
virtual const char *url() { return "http://lyrics.wikia.com/api.php?action=lyrics&fmt=xml&func=getSong&artist=%artist%&song=%title%"; }
|
||||
virtual const char *regex() { return "<url>(.*?)</url>"; }
|
||||
|
||||
virtual bool notLyrics(const std::string &data);
|
||||
};
|
||||
@@ -77,8 +65,8 @@ struct GoogleLyricsFetcher : public LyricsFetcher
|
||||
virtual Result fetch(const std::string &artist, const std::string &title);
|
||||
|
||||
protected:
|
||||
virtual const char *getSiteKeyword() = 0;
|
||||
virtual const char *getURL() { return URL; }
|
||||
virtual const char *url() { return URL; }
|
||||
virtual const char *siteKeyword() { return name(); }
|
||||
|
||||
virtual bool isURLOk(const std::string &url);
|
||||
|
||||
@@ -86,31 +74,14 @@ private:
|
||||
const char *URL;
|
||||
};
|
||||
|
||||
struct LyricstimeFetcher : public GoogleLyricsFetcher
|
||||
{
|
||||
virtual const char *name() { return "lyricstime.com"; }
|
||||
|
||||
protected:
|
||||
virtual const char *getSiteKeyword() { return "lyricstime"; }
|
||||
virtual const char *getOpenTag() { return "<div id=\"songlyrics\" >"; }
|
||||
virtual const char *getCloseTag() { return "</div>"; }
|
||||
|
||||
virtual bool isURLOk(const std::string &url);
|
||||
|
||||
virtual void postProcess(std::string &data);
|
||||
};
|
||||
|
||||
struct MetrolyricsFetcher : public GoogleLyricsFetcher
|
||||
{
|
||||
virtual const char *name() { return "metrolyrics.com"; }
|
||||
|
||||
protected:
|
||||
virtual const char *getSiteKeyword() { return "metrolyrics"; }
|
||||
virtual const char *getOpenTag() { return "<div id=\"lyrics\">"; }
|
||||
virtual const char *getCloseTag() { return "</div>"; }
|
||||
virtual const char *regex() { return "<div id=\"lyrics-body\">(.*?)</div>"; }
|
||||
|
||||
virtual bool isURLOk(const std::string &url);
|
||||
|
||||
virtual void postProcess(std::string &data);
|
||||
};
|
||||
|
||||
@@ -119,55 +90,17 @@ struct LyricsmaniaFetcher : public GoogleLyricsFetcher
|
||||
virtual const char *name() { return "lyricsmania.com"; }
|
||||
|
||||
protected:
|
||||
virtual const char *getSiteKeyword() { return "lyricsmania"; }
|
||||
virtual const char *getOpenTag() { return "</strong> :<br />"; }
|
||||
virtual const char *getCloseTag() { return "[ <a"; }
|
||||
virtual const char *regex() { return "<div id='songlyrics_h' class='dn'>(.*?)</div>"; }
|
||||
|
||||
virtual void postProcess(std::string &data);
|
||||
};
|
||||
|
||||
struct SonglyricsFetcher : public GoogleLyricsFetcher
|
||||
{
|
||||
virtual const char *name() { return "songlyrics.com"; }
|
||||
|
||||
protected:
|
||||
virtual const char *getSiteKeyword() { return "songlyrics"; }
|
||||
virtual const char *getOpenTag() { return "-6000px;\">"; }
|
||||
virtual const char *getCloseTag() { return "</p>"; }
|
||||
|
||||
virtual void postProcess(std::string &data);
|
||||
};
|
||||
|
||||
struct LyriczzFetcher : public GoogleLyricsFetcher
|
||||
{
|
||||
virtual const char *name() { return "lyriczz.com"; }
|
||||
|
||||
protected:
|
||||
virtual const char *getSiteKeyword() { return "lyriczz"; }
|
||||
virtual const char *getOpenTag() { return "border=0 /></a>"; }
|
||||
virtual const char *getCloseTag() { return "<a href"; }
|
||||
};
|
||||
|
||||
struct Sing365Fetcher : public GoogleLyricsFetcher
|
||||
{
|
||||
virtual const char *name() { return "sing365.com"; }
|
||||
|
||||
protected:
|
||||
virtual const char *getSiteKeyword() { return "sing365"; }
|
||||
virtual const char *getOpenTag() { return "<br><br></div>"; }
|
||||
virtual const char *getCloseTag() { return "<div align"; }
|
||||
};
|
||||
|
||||
struct LyricsvipFetcher : public GoogleLyricsFetcher
|
||||
{
|
||||
virtual const char *name() { return "lyricsvip.com"; }
|
||||
|
||||
protected:
|
||||
virtual const char *getSiteKeyword() { return "lyricsvip"; }
|
||||
virtual const char *getOpenTag() { return "</h2>"; }
|
||||
virtual const char *getCloseTag() { return "</td>"; }
|
||||
|
||||
virtual void postProcess(std::string &data);
|
||||
virtual const char *regex() { return "<div style=\"font-size: 14px;\">(.*?)</div>"; }
|
||||
};
|
||||
|
||||
struct JustSomeLyricsFetcher : public GoogleLyricsFetcher
|
||||
@@ -175,19 +108,7 @@ struct JustSomeLyricsFetcher : public GoogleLyricsFetcher
|
||||
virtual const char *name() { return "justsomelyrics.com"; }
|
||||
|
||||
protected:
|
||||
virtual const char *getSiteKeyword() { return "justsomelyrics"; }
|
||||
virtual const char *getOpenTag() { return "alt=\"phone\" />\n</div>"; }
|
||||
virtual const char *getCloseTag() { return "<div class=\"adsdiv\">"; }
|
||||
};
|
||||
|
||||
struct LoloLyricsFetcher : public GoogleLyricsFetcher
|
||||
{
|
||||
virtual const char *name() { return "lololyrics.com"; }
|
||||
|
||||
protected:
|
||||
virtual const char *getSiteKeyword() { return "lololyrics"; }
|
||||
virtual const char *getOpenTag() { return "<div class=\"lyrics_txt\" id=\"lyrics_txt\" style=\"font-size:12px; letter-spacing:0.2px; line-height:20px;\">"; }
|
||||
virtual const char *getCloseTag() { return "</div>"; }
|
||||
virtual const char *regex() { return "<p class=\"lyrics\">(.*?)</p>"; }
|
||||
};
|
||||
|
||||
struct InternetLyricsFetcher : public GoogleLyricsFetcher
|
||||
@@ -196,9 +117,8 @@ struct InternetLyricsFetcher : public GoogleLyricsFetcher
|
||||
virtual Result fetch(const std::string &artist, const std::string &title);
|
||||
|
||||
protected:
|
||||
virtual const char *getSiteKeyword() { return "lyrics"; }
|
||||
virtual const char *getOpenTag() { return ""; }
|
||||
virtual const char *getCloseTag() { return ""; }
|
||||
virtual const char *siteKeyword() { return "lyrics"; }
|
||||
virtual const char *regex() { return ""; }
|
||||
|
||||
virtual bool isURLOk(const std::string &url);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user