lyrics fetcher: throw away broken fetchers and fix/improve existing ones
This commit is contained in:
@@ -18,6 +18,7 @@
|
||||
* 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. *
|
||||
***************************************************************************/
|
||||
|
||||
#include "config.h"
|
||||
#include "curl_handle.h"
|
||||
|
||||
#ifdef HAVE_CURL_CURL_H
|
||||
@@ -26,6 +27,7 @@
|
||||
#include <cstring>
|
||||
#include <boost/algorithm/string/replace.hpp>
|
||||
#include <boost/algorithm/string/trim.hpp>
|
||||
#include <boost/regex.hpp>
|
||||
|
||||
#include "charset.h"
|
||||
#include "lyrics_fetcher.h"
|
||||
@@ -35,16 +37,10 @@
|
||||
LyricsFetcher *lyricsPlugins[] =
|
||||
{
|
||||
new LyricwikiFetcher(),
|
||||
new LyricsvipFetcher(),
|
||||
new Sing365Fetcher(),
|
||||
new LoloLyricsFetcher(),
|
||||
new LyriczzFetcher(),
|
||||
new SonglyricsFetcher(),
|
||||
new LyricsmaniaFetcher(),
|
||||
new LyricstimeFetcher(),
|
||||
new MetrolyricsFetcher(),
|
||||
new JustSomeLyricsFetcher(),
|
||||
new LyrcComArFetcher(),
|
||||
new InternetLyricsFetcher(),
|
||||
0
|
||||
};
|
||||
@@ -56,7 +52,7 @@ LyricsFetcher::Result LyricsFetcher::fetch(const std::string &artist, const std:
|
||||
Result result;
|
||||
result.first = false;
|
||||
|
||||
std::string url = getURL();
|
||||
std::string url = this->url();
|
||||
boost::replace_all(url, "%artist%", artist.c_str());
|
||||
boost::replace_all(url, "%title%", title.c_str());
|
||||
|
||||
@@ -69,35 +65,40 @@ LyricsFetcher::Result LyricsFetcher::fetch(const std::string &artist, const std:
|
||||
return result;
|
||||
}
|
||||
|
||||
bool parse_ok = getContent(getOpenTag(), getCloseTag(), data);
|
||||
auto lyrics = getContent(regex(), data);
|
||||
|
||||
if (!parse_ok || notLyrics(data))
|
||||
if (lyrics.empty() || notLyrics(data))
|
||||
{
|
||||
result.second = msgNotFound;
|
||||
return result;
|
||||
}
|
||||
|
||||
postProcess(data);
|
||||
data.clear();
|
||||
for (auto it = lyrics.begin(); it != lyrics.end(); ++it)
|
||||
{
|
||||
postProcess(*it);
|
||||
if (!it->empty())
|
||||
{
|
||||
data += *it;
|
||||
if (it != lyrics.end()-1)
|
||||
data += "\n\n----------\n\n";
|
||||
}
|
||||
}
|
||||
|
||||
result.second = data;
|
||||
result.first = true;
|
||||
return result;
|
||||
}
|
||||
|
||||
bool LyricsFetcher::getContent(const char *open_tag, const char *close_tag, std::string &data)
|
||||
std::vector<std::string> LyricsFetcher::getContent(const char *regex, const std::string &data)
|
||||
{
|
||||
size_t a, b;
|
||||
if ((a = data.find(open_tag)) != std::string::npos)
|
||||
{
|
||||
a += strlen(open_tag);
|
||||
if ((b = data.find(close_tag, a)) != std::string::npos)
|
||||
data = data.substr(a, b-a);
|
||||
else
|
||||
return false;
|
||||
}
|
||||
else
|
||||
return false;
|
||||
return true;
|
||||
std::vector<std::string> result;
|
||||
boost::regex rx(regex);
|
||||
auto first = boost::sregex_iterator(data.begin(), data.end(), rx);
|
||||
auto last = boost::sregex_iterator();
|
||||
for (; first != last; ++first)
|
||||
result.push_back(first->str(1));
|
||||
return result;
|
||||
}
|
||||
|
||||
void LyricsFetcher::postProcess(std::string &data)
|
||||
@@ -124,23 +125,36 @@ LyricsFetcher::Result LyricwikiFetcher::fetch(const std::string &artist, const s
|
||||
return result;
|
||||
}
|
||||
|
||||
bool parse_ok = getContent("'17'/></a></div>", "<!--", data);
|
||||
auto lyrics = getContent("<div class='lyricbox'><div class='rtMatcher'>.*?</div>(.*?)<!--", data);
|
||||
|
||||
if (!parse_ok)
|
||||
if (lyrics.empty())
|
||||
{
|
||||
result.second = msgNotFound;
|
||||
return result;
|
||||
}
|
||||
data = unescapeHtmlUtf8(data);
|
||||
if (data.find("Unfortunately, we are not licensed to display the full lyrics for this song at the moment.") != std::string::npos)
|
||||
std::transform(lyrics.begin(), lyrics.end(), lyrics.begin(), unescapeHtmlUtf8);
|
||||
bool license_restriction = std::any_of(lyrics.begin(), lyrics.end(), [](const std::string &s) {
|
||||
return s.find("Unfortunately, we are not licensed to display the full lyrics for this song at the moment.") != std::string::npos;
|
||||
});
|
||||
if (license_restriction)
|
||||
{
|
||||
result.second = "Licence restriction";
|
||||
return result;
|
||||
}
|
||||
|
||||
boost::replace_all(data, "<br />", "\n");
|
||||
stripHtmlTags(data);
|
||||
boost::trim(data);
|
||||
data.clear();
|
||||
for (auto it = lyrics.begin(); it != lyrics.end(); ++it)
|
||||
{
|
||||
boost::replace_all(*it, "<br />", "\n");
|
||||
stripHtmlTags(*it);
|
||||
boost::trim(*it);
|
||||
if (!it->empty())
|
||||
{
|
||||
data += *it;
|
||||
if (it != lyrics.end()-1)
|
||||
data += "\n\n----------\n\n";
|
||||
}
|
||||
}
|
||||
|
||||
result.second = data;
|
||||
result.first = true;
|
||||
@@ -163,8 +177,8 @@ LyricsFetcher::Result GoogleLyricsFetcher::fetch(const std::string &artist, cons
|
||||
std::string search_str = artist;
|
||||
search_str += "+";
|
||||
search_str += title;
|
||||
search_str += "+";
|
||||
search_str += getSiteKeyword();
|
||||
search_str += "+%2B";
|
||||
search_str += siteKeyword();
|
||||
|
||||
std::string google_url = "http://www.google.com/search?hl=en&ie=UTF-8&oe=UTF-8&q=";
|
||||
google_url += search_str;
|
||||
@@ -179,15 +193,15 @@ LyricsFetcher::Result GoogleLyricsFetcher::fetch(const std::string &artist, cons
|
||||
return result;
|
||||
}
|
||||
|
||||
bool found_url = getContent("<A HREF=\"", "\">here</A>", data);
|
||||
auto urls = getContent("<A HREF=\"(.*?)\">here</A>", data);
|
||||
|
||||
if (!found_url || !isURLOk(data))
|
||||
if (urls.empty() || !isURLOk(urls[0]))
|
||||
{
|
||||
result.second = msgNotFound;
|
||||
return result;
|
||||
}
|
||||
|
||||
data = unescapeHtmlUtf8(data);
|
||||
data = unescapeHtmlUtf8(urls[0]);
|
||||
//result.second = data;
|
||||
//return result;
|
||||
|
||||
@@ -197,25 +211,7 @@ LyricsFetcher::Result GoogleLyricsFetcher::fetch(const std::string &artist, cons
|
||||
|
||||
bool GoogleLyricsFetcher::isURLOk(const std::string &url)
|
||||
{
|
||||
return url.find(getSiteKeyword()) != std::string::npos;
|
||||
}
|
||||
|
||||
/**********************************************************************/
|
||||
|
||||
bool LyricstimeFetcher::isURLOk(const std::string &url)
|
||||
{
|
||||
// it sometimes returns list of all artists that begin
|
||||
// with a given letter, e.g. www.lyricstime.com/A.html, which
|
||||
// is 25 chars long, so we want longer.
|
||||
return GoogleLyricsFetcher::isURLOk(url) && url.length() > 25;
|
||||
}
|
||||
|
||||
void LyricstimeFetcher::postProcess(std::string &data)
|
||||
{
|
||||
// lyricstime.com uses iso-8859-1 as the encoding
|
||||
// so we need to convert obtained lyrics to utf-8
|
||||
data = Charset::toUtf8From(data, "iso-8859-1");
|
||||
LyricsFetcher::postProcess(data);
|
||||
return url.find(siteKeyword()) != std::string::npos;
|
||||
}
|
||||
|
||||
/**********************************************************************/
|
||||
@@ -253,33 +249,6 @@ void LyricsmaniaFetcher::postProcess(std::string &data)
|
||||
|
||||
/**********************************************************************/
|
||||
|
||||
void SonglyricsFetcher::postProcess(std::string &data)
|
||||
{
|
||||
// throw away [ ... lyrics are found on www.songlyrics.com ] info.
|
||||
// there is +2 instead of +1 in third line because there is extra
|
||||
// space after ] we also want to get rid of
|
||||
size_t i = data.find('['), j = data.find(']');
|
||||
if (i != std::string::npos && i != std::string::npos)
|
||||
data.replace(i, j-i+2, "");
|
||||
data = unescapeHtmlUtf8(data);
|
||||
LyricsFetcher::postProcess(data);
|
||||
}
|
||||
|
||||
|
||||
/**********************************************************************/
|
||||
|
||||
void LyricsvipFetcher::postProcess(std::string &data)
|
||||
{
|
||||
// throw away <div> with ad
|
||||
size_t i = data.find("<div class=\"ad\""), j = data.find("</div>");
|
||||
if (i != std::string::npos && i != std::string::npos)
|
||||
data.replace(i, j-i+const_strlen("</div>"), "");
|
||||
data = unescapeHtmlUtf8(data);
|
||||
LyricsFetcher::postProcess(data);
|
||||
}
|
||||
|
||||
/**********************************************************************/
|
||||
|
||||
LyricsFetcher::Result InternetLyricsFetcher::fetch(const std::string &artist, const std::string &title)
|
||||
{
|
||||
GoogleLyricsFetcher::fetch(artist, title);
|
||||
|
||||
Reference in New Issue
Block a user