lyrics fetcher: throw away broken fetchers and fix/improve existing ones
This commit is contained in:
@@ -18,6 +18,7 @@
|
|||||||
* 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. *
|
* 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. *
|
||||||
***************************************************************************/
|
***************************************************************************/
|
||||||
|
|
||||||
|
#include "config.h"
|
||||||
#include "curl_handle.h"
|
#include "curl_handle.h"
|
||||||
|
|
||||||
#ifdef HAVE_CURL_CURL_H
|
#ifdef HAVE_CURL_CURL_H
|
||||||
@@ -26,6 +27,7 @@
|
|||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include <boost/algorithm/string/replace.hpp>
|
#include <boost/algorithm/string/replace.hpp>
|
||||||
#include <boost/algorithm/string/trim.hpp>
|
#include <boost/algorithm/string/trim.hpp>
|
||||||
|
#include <boost/regex.hpp>
|
||||||
|
|
||||||
#include "charset.h"
|
#include "charset.h"
|
||||||
#include "lyrics_fetcher.h"
|
#include "lyrics_fetcher.h"
|
||||||
@@ -35,16 +37,10 @@
|
|||||||
LyricsFetcher *lyricsPlugins[] =
|
LyricsFetcher *lyricsPlugins[] =
|
||||||
{
|
{
|
||||||
new LyricwikiFetcher(),
|
new LyricwikiFetcher(),
|
||||||
new LyricsvipFetcher(),
|
|
||||||
new Sing365Fetcher(),
|
new Sing365Fetcher(),
|
||||||
new LoloLyricsFetcher(),
|
|
||||||
new LyriczzFetcher(),
|
|
||||||
new SonglyricsFetcher(),
|
|
||||||
new LyricsmaniaFetcher(),
|
new LyricsmaniaFetcher(),
|
||||||
new LyricstimeFetcher(),
|
|
||||||
new MetrolyricsFetcher(),
|
new MetrolyricsFetcher(),
|
||||||
new JustSomeLyricsFetcher(),
|
new JustSomeLyricsFetcher(),
|
||||||
new LyrcComArFetcher(),
|
|
||||||
new InternetLyricsFetcher(),
|
new InternetLyricsFetcher(),
|
||||||
0
|
0
|
||||||
};
|
};
|
||||||
@@ -56,7 +52,7 @@ LyricsFetcher::Result LyricsFetcher::fetch(const std::string &artist, const std:
|
|||||||
Result result;
|
Result result;
|
||||||
result.first = false;
|
result.first = false;
|
||||||
|
|
||||||
std::string url = getURL();
|
std::string url = this->url();
|
||||||
boost::replace_all(url, "%artist%", artist.c_str());
|
boost::replace_all(url, "%artist%", artist.c_str());
|
||||||
boost::replace_all(url, "%title%", title.c_str());
|
boost::replace_all(url, "%title%", title.c_str());
|
||||||
|
|
||||||
@@ -69,35 +65,40 @@ LyricsFetcher::Result LyricsFetcher::fetch(const std::string &artist, const std:
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool parse_ok = getContent(getOpenTag(), getCloseTag(), data);
|
auto lyrics = getContent(regex(), data);
|
||||||
|
|
||||||
if (!parse_ok || notLyrics(data))
|
if (lyrics.empty() || notLyrics(data))
|
||||||
{
|
{
|
||||||
result.second = msgNotFound;
|
result.second = msgNotFound;
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
postProcess(data);
|
data.clear();
|
||||||
|
for (auto it = lyrics.begin(); it != lyrics.end(); ++it)
|
||||||
|
{
|
||||||
|
postProcess(*it);
|
||||||
|
if (!it->empty())
|
||||||
|
{
|
||||||
|
data += *it;
|
||||||
|
if (it != lyrics.end()-1)
|
||||||
|
data += "\n\n----------\n\n";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
result.second = data;
|
result.second = data;
|
||||||
result.first = true;
|
result.first = true;
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool LyricsFetcher::getContent(const char *open_tag, const char *close_tag, std::string &data)
|
std::vector<std::string> LyricsFetcher::getContent(const char *regex, const std::string &data)
|
||||||
{
|
{
|
||||||
size_t a, b;
|
std::vector<std::string> result;
|
||||||
if ((a = data.find(open_tag)) != std::string::npos)
|
boost::regex rx(regex);
|
||||||
{
|
auto first = boost::sregex_iterator(data.begin(), data.end(), rx);
|
||||||
a += strlen(open_tag);
|
auto last = boost::sregex_iterator();
|
||||||
if ((b = data.find(close_tag, a)) != std::string::npos)
|
for (; first != last; ++first)
|
||||||
data = data.substr(a, b-a);
|
result.push_back(first->str(1));
|
||||||
else
|
return result;
|
||||||
return false;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
return false;
|
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void LyricsFetcher::postProcess(std::string &data)
|
void LyricsFetcher::postProcess(std::string &data)
|
||||||
@@ -124,23 +125,36 @@ LyricsFetcher::Result LyricwikiFetcher::fetch(const std::string &artist, const s
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool parse_ok = getContent("'17'/></a></div>", "<!--", data);
|
auto lyrics = getContent("<div class='lyricbox'><div class='rtMatcher'>.*?</div>(.*?)<!--", data);
|
||||||
|
|
||||||
if (!parse_ok)
|
if (lyrics.empty())
|
||||||
{
|
{
|
||||||
result.second = msgNotFound;
|
result.second = msgNotFound;
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
data = unescapeHtmlUtf8(data);
|
std::transform(lyrics.begin(), lyrics.end(), lyrics.begin(), unescapeHtmlUtf8);
|
||||||
if (data.find("Unfortunately, we are not licensed to display the full lyrics for this song at the moment.") != std::string::npos)
|
bool license_restriction = std::any_of(lyrics.begin(), lyrics.end(), [](const std::string &s) {
|
||||||
|
return s.find("Unfortunately, we are not licensed to display the full lyrics for this song at the moment.") != std::string::npos;
|
||||||
|
});
|
||||||
|
if (license_restriction)
|
||||||
{
|
{
|
||||||
result.second = "Licence restriction";
|
result.second = "Licence restriction";
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
boost::replace_all(data, "<br />", "\n");
|
data.clear();
|
||||||
stripHtmlTags(data);
|
for (auto it = lyrics.begin(); it != lyrics.end(); ++it)
|
||||||
boost::trim(data);
|
{
|
||||||
|
boost::replace_all(*it, "<br />", "\n");
|
||||||
|
stripHtmlTags(*it);
|
||||||
|
boost::trim(*it);
|
||||||
|
if (!it->empty())
|
||||||
|
{
|
||||||
|
data += *it;
|
||||||
|
if (it != lyrics.end()-1)
|
||||||
|
data += "\n\n----------\n\n";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
result.second = data;
|
result.second = data;
|
||||||
result.first = true;
|
result.first = true;
|
||||||
@@ -163,8 +177,8 @@ LyricsFetcher::Result GoogleLyricsFetcher::fetch(const std::string &artist, cons
|
|||||||
std::string search_str = artist;
|
std::string search_str = artist;
|
||||||
search_str += "+";
|
search_str += "+";
|
||||||
search_str += title;
|
search_str += title;
|
||||||
search_str += "+";
|
search_str += "+%2B";
|
||||||
search_str += getSiteKeyword();
|
search_str += siteKeyword();
|
||||||
|
|
||||||
std::string google_url = "http://www.google.com/search?hl=en&ie=UTF-8&oe=UTF-8&q=";
|
std::string google_url = "http://www.google.com/search?hl=en&ie=UTF-8&oe=UTF-8&q=";
|
||||||
google_url += search_str;
|
google_url += search_str;
|
||||||
@@ -179,15 +193,15 @@ LyricsFetcher::Result GoogleLyricsFetcher::fetch(const std::string &artist, cons
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool found_url = getContent("<A HREF=\"", "\">here</A>", data);
|
auto urls = getContent("<A HREF=\"(.*?)\">here</A>", data);
|
||||||
|
|
||||||
if (!found_url || !isURLOk(data))
|
if (urls.empty() || !isURLOk(urls[0]))
|
||||||
{
|
{
|
||||||
result.second = msgNotFound;
|
result.second = msgNotFound;
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
data = unescapeHtmlUtf8(data);
|
data = unescapeHtmlUtf8(urls[0]);
|
||||||
//result.second = data;
|
//result.second = data;
|
||||||
//return result;
|
//return result;
|
||||||
|
|
||||||
@@ -197,25 +211,7 @@ LyricsFetcher::Result GoogleLyricsFetcher::fetch(const std::string &artist, cons
|
|||||||
|
|
||||||
bool GoogleLyricsFetcher::isURLOk(const std::string &url)
|
bool GoogleLyricsFetcher::isURLOk(const std::string &url)
|
||||||
{
|
{
|
||||||
return url.find(getSiteKeyword()) != std::string::npos;
|
return url.find(siteKeyword()) != std::string::npos;
|
||||||
}
|
|
||||||
|
|
||||||
/**********************************************************************/
|
|
||||||
|
|
||||||
bool LyricstimeFetcher::isURLOk(const std::string &url)
|
|
||||||
{
|
|
||||||
// it sometimes returns list of all artists that begin
|
|
||||||
// with a given letter, e.g. www.lyricstime.com/A.html, which
|
|
||||||
// is 25 chars long, so we want longer.
|
|
||||||
return GoogleLyricsFetcher::isURLOk(url) && url.length() > 25;
|
|
||||||
}
|
|
||||||
|
|
||||||
void LyricstimeFetcher::postProcess(std::string &data)
|
|
||||||
{
|
|
||||||
// lyricstime.com uses iso-8859-1 as the encoding
|
|
||||||
// so we need to convert obtained lyrics to utf-8
|
|
||||||
data = Charset::toUtf8From(data, "iso-8859-1");
|
|
||||||
LyricsFetcher::postProcess(data);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**********************************************************************/
|
/**********************************************************************/
|
||||||
@@ -253,33 +249,6 @@ void LyricsmaniaFetcher::postProcess(std::string &data)
|
|||||||
|
|
||||||
/**********************************************************************/
|
/**********************************************************************/
|
||||||
|
|
||||||
void SonglyricsFetcher::postProcess(std::string &data)
|
|
||||||
{
|
|
||||||
// throw away [ ... lyrics are found on www.songlyrics.com ] info.
|
|
||||||
// there is +2 instead of +1 in third line because there is extra
|
|
||||||
// space after ] we also want to get rid of
|
|
||||||
size_t i = data.find('['), j = data.find(']');
|
|
||||||
if (i != std::string::npos && i != std::string::npos)
|
|
||||||
data.replace(i, j-i+2, "");
|
|
||||||
data = unescapeHtmlUtf8(data);
|
|
||||||
LyricsFetcher::postProcess(data);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**********************************************************************/
|
|
||||||
|
|
||||||
void LyricsvipFetcher::postProcess(std::string &data)
|
|
||||||
{
|
|
||||||
// throw away <div> with ad
|
|
||||||
size_t i = data.find("<div class=\"ad\""), j = data.find("</div>");
|
|
||||||
if (i != std::string::npos && i != std::string::npos)
|
|
||||||
data.replace(i, j-i+const_strlen("</div>"), "");
|
|
||||||
data = unescapeHtmlUtf8(data);
|
|
||||||
LyricsFetcher::postProcess(data);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**********************************************************************/
|
|
||||||
|
|
||||||
LyricsFetcher::Result InternetLyricsFetcher::fetch(const std::string &artist, const std::string &title)
|
LyricsFetcher::Result InternetLyricsFetcher::fetch(const std::string &artist, const std::string &title)
|
||||||
{
|
{
|
||||||
GoogleLyricsFetcher::fetch(artist, title);
|
GoogleLyricsFetcher::fetch(artist, title);
|
||||||
|
|||||||
@@ -35,37 +35,25 @@ struct LyricsFetcher
|
|||||||
virtual Result fetch(const std::string &artist, const std::string &title);
|
virtual Result fetch(const std::string &artist, const std::string &title);
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
virtual const char *getURL() = 0;
|
virtual const char *url() = 0;
|
||||||
virtual const char *getOpenTag() = 0;
|
virtual const char *regex() = 0;
|
||||||
virtual const char *getCloseTag() = 0;
|
|
||||||
|
|
||||||
virtual bool notLyrics(const std::string &) { return false; }
|
virtual bool notLyrics(const std::string &) { return false; }
|
||||||
virtual void postProcess(std::string &data);
|
virtual void postProcess(std::string &data);
|
||||||
|
|
||||||
bool getContent(const char *open_tag, const char *close_tag, std::string &data);
|
std::vector<std::string> getContent(const char *regex, const std::string &data);
|
||||||
|
|
||||||
static const char msgNotFound[];
|
static const char msgNotFound[];
|
||||||
};
|
};
|
||||||
|
|
||||||
struct LyrcComArFetcher : public LyricsFetcher
|
|
||||||
{
|
|
||||||
virtual const char *name() { return "lyrc.com.ar"; }
|
|
||||||
|
|
||||||
protected:
|
|
||||||
virtual const char *getURL() { return "http://lyrc.com.ar/tema1es.php?artist=%artist%&songname=%title%"; }
|
|
||||||
virtual const char *getOpenTag() { return "</table>"; }
|
|
||||||
virtual const char *getCloseTag() { return "<p>"; }
|
|
||||||
};
|
|
||||||
|
|
||||||
struct LyricwikiFetcher : public LyricsFetcher
|
struct LyricwikiFetcher : public LyricsFetcher
|
||||||
{
|
{
|
||||||
virtual const char *name() { return "lyricwiki.com"; }
|
virtual const char *name() { return "lyricwiki.com"; }
|
||||||
virtual Result fetch(const std::string &artist, const std::string &title);
|
virtual Result fetch(const std::string &artist, const std::string &title);
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
virtual const char *getURL() { return "http://lyrics.wikia.com/api.php?action=lyrics&fmt=xml&func=getSong&artist=%artist%&song=%title%"; }
|
virtual const char *url() { return "http://lyrics.wikia.com/api.php?action=lyrics&fmt=xml&func=getSong&artist=%artist%&song=%title%"; }
|
||||||
virtual const char *getOpenTag() { return "<url>"; }
|
virtual const char *regex() { return "<url>(.*?)</url>"; }
|
||||||
virtual const char *getCloseTag() { return "</url>"; }
|
|
||||||
|
|
||||||
virtual bool notLyrics(const std::string &data);
|
virtual bool notLyrics(const std::string &data);
|
||||||
};
|
};
|
||||||
@@ -77,8 +65,8 @@ struct GoogleLyricsFetcher : public LyricsFetcher
|
|||||||
virtual Result fetch(const std::string &artist, const std::string &title);
|
virtual Result fetch(const std::string &artist, const std::string &title);
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
virtual const char *getSiteKeyword() = 0;
|
virtual const char *url() { return URL; }
|
||||||
virtual const char *getURL() { return URL; }
|
virtual const char *siteKeyword() { return name(); }
|
||||||
|
|
||||||
virtual bool isURLOk(const std::string &url);
|
virtual bool isURLOk(const std::string &url);
|
||||||
|
|
||||||
@@ -86,31 +74,14 @@ private:
|
|||||||
const char *URL;
|
const char *URL;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct LyricstimeFetcher : public GoogleLyricsFetcher
|
|
||||||
{
|
|
||||||
virtual const char *name() { return "lyricstime.com"; }
|
|
||||||
|
|
||||||
protected:
|
|
||||||
virtual const char *getSiteKeyword() { return "lyricstime"; }
|
|
||||||
virtual const char *getOpenTag() { return "<div id=\"songlyrics\" >"; }
|
|
||||||
virtual const char *getCloseTag() { return "</div>"; }
|
|
||||||
|
|
||||||
virtual bool isURLOk(const std::string &url);
|
|
||||||
|
|
||||||
virtual void postProcess(std::string &data);
|
|
||||||
};
|
|
||||||
|
|
||||||
struct MetrolyricsFetcher : public GoogleLyricsFetcher
|
struct MetrolyricsFetcher : public GoogleLyricsFetcher
|
||||||
{
|
{
|
||||||
virtual const char *name() { return "metrolyrics.com"; }
|
virtual const char *name() { return "metrolyrics.com"; }
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
virtual const char *getSiteKeyword() { return "metrolyrics"; }
|
virtual const char *regex() { return "<div id=\"lyrics-body\">(.*?)</div>"; }
|
||||||
virtual const char *getOpenTag() { return "<div id=\"lyrics\">"; }
|
|
||||||
virtual const char *getCloseTag() { return "</div>"; }
|
|
||||||
|
|
||||||
virtual bool isURLOk(const std::string &url);
|
virtual bool isURLOk(const std::string &url);
|
||||||
|
|
||||||
virtual void postProcess(std::string &data);
|
virtual void postProcess(std::string &data);
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -119,55 +90,17 @@ struct LyricsmaniaFetcher : public GoogleLyricsFetcher
|
|||||||
virtual const char *name() { return "lyricsmania.com"; }
|
virtual const char *name() { return "lyricsmania.com"; }
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
virtual const char *getSiteKeyword() { return "lyricsmania"; }
|
virtual const char *regex() { return "<div id='songlyrics_h' class='dn'>(.*?)</div>"; }
|
||||||
virtual const char *getOpenTag() { return "</strong> :<br />"; }
|
|
||||||
virtual const char *getCloseTag() { return "[ <a"; }
|
|
||||||
|
|
||||||
virtual void postProcess(std::string &data);
|
virtual void postProcess(std::string &data);
|
||||||
};
|
};
|
||||||
|
|
||||||
struct SonglyricsFetcher : public GoogleLyricsFetcher
|
|
||||||
{
|
|
||||||
virtual const char *name() { return "songlyrics.com"; }
|
|
||||||
|
|
||||||
protected:
|
|
||||||
virtual const char *getSiteKeyword() { return "songlyrics"; }
|
|
||||||
virtual const char *getOpenTag() { return "-6000px;\">"; }
|
|
||||||
virtual const char *getCloseTag() { return "</p>"; }
|
|
||||||
|
|
||||||
virtual void postProcess(std::string &data);
|
|
||||||
};
|
|
||||||
|
|
||||||
struct LyriczzFetcher : public GoogleLyricsFetcher
|
|
||||||
{
|
|
||||||
virtual const char *name() { return "lyriczz.com"; }
|
|
||||||
|
|
||||||
protected:
|
|
||||||
virtual const char *getSiteKeyword() { return "lyriczz"; }
|
|
||||||
virtual const char *getOpenTag() { return "border=0 /></a>"; }
|
|
||||||
virtual const char *getCloseTag() { return "<a href"; }
|
|
||||||
};
|
|
||||||
|
|
||||||
struct Sing365Fetcher : public GoogleLyricsFetcher
|
struct Sing365Fetcher : public GoogleLyricsFetcher
|
||||||
{
|
{
|
||||||
virtual const char *name() { return "sing365.com"; }
|
virtual const char *name() { return "sing365.com"; }
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
virtual const char *getSiteKeyword() { return "sing365"; }
|
virtual const char *regex() { return "<div style=\"font-size: 14px;\">(.*?)</div>"; }
|
||||||
virtual const char *getOpenTag() { return "<br><br></div>"; }
|
|
||||||
virtual const char *getCloseTag() { return "<div align"; }
|
|
||||||
};
|
|
||||||
|
|
||||||
struct LyricsvipFetcher : public GoogleLyricsFetcher
|
|
||||||
{
|
|
||||||
virtual const char *name() { return "lyricsvip.com"; }
|
|
||||||
|
|
||||||
protected:
|
|
||||||
virtual const char *getSiteKeyword() { return "lyricsvip"; }
|
|
||||||
virtual const char *getOpenTag() { return "</h2>"; }
|
|
||||||
virtual const char *getCloseTag() { return "</td>"; }
|
|
||||||
|
|
||||||
virtual void postProcess(std::string &data);
|
|
||||||
};
|
};
|
||||||
|
|
||||||
struct JustSomeLyricsFetcher : public GoogleLyricsFetcher
|
struct JustSomeLyricsFetcher : public GoogleLyricsFetcher
|
||||||
@@ -175,19 +108,7 @@ struct JustSomeLyricsFetcher : public GoogleLyricsFetcher
|
|||||||
virtual const char *name() { return "justsomelyrics.com"; }
|
virtual const char *name() { return "justsomelyrics.com"; }
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
virtual const char *getSiteKeyword() { return "justsomelyrics"; }
|
virtual const char *regex() { return "<p class=\"lyrics\">(.*?)</p>"; }
|
||||||
virtual const char *getOpenTag() { return "alt=\"phone\" />\n</div>"; }
|
|
||||||
virtual const char *getCloseTag() { return "<div class=\"adsdiv\">"; }
|
|
||||||
};
|
|
||||||
|
|
||||||
struct LoloLyricsFetcher : public GoogleLyricsFetcher
|
|
||||||
{
|
|
||||||
virtual const char *name() { return "lololyrics.com"; }
|
|
||||||
|
|
||||||
protected:
|
|
||||||
virtual const char *getSiteKeyword() { return "lololyrics"; }
|
|
||||||
virtual const char *getOpenTag() { return "<div class=\"lyrics_txt\" id=\"lyrics_txt\" style=\"font-size:12px; letter-spacing:0.2px; line-height:20px;\">"; }
|
|
||||||
virtual const char *getCloseTag() { return "</div>"; }
|
|
||||||
};
|
};
|
||||||
|
|
||||||
struct InternetLyricsFetcher : public GoogleLyricsFetcher
|
struct InternetLyricsFetcher : public GoogleLyricsFetcher
|
||||||
@@ -196,9 +117,8 @@ struct InternetLyricsFetcher : public GoogleLyricsFetcher
|
|||||||
virtual Result fetch(const std::string &artist, const std::string &title);
|
virtual Result fetch(const std::string &artist, const std::string &title);
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
virtual const char *getSiteKeyword() { return "lyrics"; }
|
virtual const char *siteKeyword() { return "lyrics"; }
|
||||||
virtual const char *getOpenTag() { return ""; }
|
virtual const char *regex() { return ""; }
|
||||||
virtual const char *getCloseTag() { return ""; }
|
|
||||||
|
|
||||||
virtual bool isURLOk(const std::string &url);
|
virtual bool isURLOk(const std::string &url);
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user