lyrics fetcher: throw away broken fetchers and fix/improve existing ones

This commit is contained in:
Andrzej Rybczak
2013-07-09 00:20:04 +02:00
parent af10ac883f
commit 6d6110a52b
2 changed files with 63 additions and 174 deletions

View File

@@ -18,6 +18,7 @@
* 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. *
***************************************************************************/
#include "config.h"
#include "curl_handle.h"
#ifdef HAVE_CURL_CURL_H
@@ -26,6 +27,7 @@
#include <cstring>
#include <boost/algorithm/string/replace.hpp>
#include <boost/algorithm/string/trim.hpp>
#include <boost/regex.hpp>
#include "charset.h"
#include "lyrics_fetcher.h"
@@ -35,16 +37,10 @@
// Table of lyrics fetcher instances, one heap-allocated object per supported
// site. The list is terminated by a 0 sentinel rather than carrying an
// explicit length.
// NOTE(review): presumably callers walk the table in this order until they
// reach the sentinel, so the order of the entries matters — confirm against
// the code that iterates over lyricsPlugins.
LyricsFetcher *lyricsPlugins[] =
{
new LyricwikiFetcher(),
new LyricsvipFetcher(),
new Sing365Fetcher(),
new LoloLyricsFetcher(),
new LyriczzFetcher(),
new SonglyricsFetcher(),
new LyricsmaniaFetcher(),
new LyricstimeFetcher(),
new MetrolyricsFetcher(),
new JustSomeLyricsFetcher(),
new LyrcComArFetcher(),
new InternetLyricsFetcher(),
// end-of-table sentinel
0
};
@@ -56,7 +52,7 @@ LyricsFetcher::Result LyricsFetcher::fetch(const std::string &artist, const std:
Result result;
result.first = false;
std::string url = getURL();
std::string url = this->url();
boost::replace_all(url, "%artist%", artist.c_str());
boost::replace_all(url, "%title%", title.c_str());
@@ -69,35 +65,40 @@ LyricsFetcher::Result LyricsFetcher::fetch(const std::string &artist, const std:
return result;
}
bool parse_ok = getContent(getOpenTag(), getCloseTag(), data);
auto lyrics = getContent(regex(), data);
if (!parse_ok || notLyrics(data))
if (lyrics.empty() || notLyrics(data))
{
result.second = msgNotFound;
return result;
}
postProcess(data);
data.clear();
for (auto it = lyrics.begin(); it != lyrics.end(); ++it)
{
postProcess(*it);
if (!it->empty())
{
data += *it;
if (it != lyrics.end()-1)
data += "\n\n----------\n\n";
}
}
result.second = data;
result.first = true;
return result;
}
// NOTE(review): two revisions of getContent() are interleaved below (this is
// diff output without its +/- markers), so both signatures and both bodies
// share a single pair of braces here.
//
// Tag-based revision: looks for the first occurrence of open_tag and then for
// close_tag after it. On success `data` is replaced by the text between the
// two tags and true is returned; if either tag is missing `data` is left
// untouched and false is returned.
bool LyricsFetcher::getContent(const char *open_tag, const char *close_tag, std::string &data)
// Regex-based revision: returns capture group 1 of every match of `regex`
// found in `data`, in match order. The result is empty when nothing matches.
std::vector<std::string> LyricsFetcher::getContent(const char *regex, const std::string &data)
{
size_t a, b;
if ((a = data.find(open_tag)) != std::string::npos)
{
// skip over the opening tag itself so it is not part of the result
a += strlen(open_tag);
if ((b = data.find(close_tag, a)) != std::string::npos)
data = data.substr(a, b-a);
else
return false;
}
else
return false;
return true;
std::vector<std::string> result;
boost::regex rx(regex);
// a default-constructed sregex_iterator serves as the end-of-matches marker
auto first = boost::sregex_iterator(data.begin(), data.end(), rx);
auto last = boost::sregex_iterator();
for (; first != last; ++first)
// keep only the first capture group of each match, not the whole match
result.push_back(first->str(1));
return result;
}
void LyricsFetcher::postProcess(std::string &data)
@@ -124,23 +125,36 @@ LyricsFetcher::Result LyricwikiFetcher::fetch(const std::string &artist, const s
return result;
}
bool parse_ok = getContent("'17'/></a></div>", "<!--", data);
auto lyrics = getContent("<div class='lyricbox'><div class='rtMatcher'>.*?</div>(.*?)<!--", data);
if (!parse_ok)
if (lyrics.empty())
{
result.second = msgNotFound;
return result;
}
data = unescapeHtmlUtf8(data);
if (data.find("Unfortunately, we are not licensed to display the full lyrics for this song at the moment.") != std::string::npos)
std::transform(lyrics.begin(), lyrics.end(), lyrics.begin(), unescapeHtmlUtf8);
bool license_restriction = std::any_of(lyrics.begin(), lyrics.end(), [](const std::string &s) {
return s.find("Unfortunately, we are not licensed to display the full lyrics for this song at the moment.") != std::string::npos;
});
if (license_restriction)
{
result.second = "Licence restriction";
return result;
}
boost::replace_all(data, "<br />", "\n");
stripHtmlTags(data);
boost::trim(data);
data.clear();
for (auto it = lyrics.begin(); it != lyrics.end(); ++it)
{
boost::replace_all(*it, "<br />", "\n");
stripHtmlTags(*it);
boost::trim(*it);
if (!it->empty())
{
data += *it;
if (it != lyrics.end()-1)
data += "\n\n----------\n\n";
}
}
result.second = data;
result.first = true;
@@ -163,8 +177,8 @@ LyricsFetcher::Result GoogleLyricsFetcher::fetch(const std::string &artist, cons
std::string search_str = artist;
search_str += "+";
search_str += title;
search_str += "+";
search_str += getSiteKeyword();
search_str += "+%2B";
search_str += siteKeyword();
std::string google_url = "http://www.google.com/search?hl=en&ie=UTF-8&oe=UTF-8&q=";
google_url += search_str;
@@ -179,15 +193,15 @@ LyricsFetcher::Result GoogleLyricsFetcher::fetch(const std::string &artist, cons
return result;
}
bool found_url = getContent("<A HREF=\"", "\">here</A>", data);
auto urls = getContent("<A HREF=\"(.*?)\">here</A>", data);
if (!found_url || !isURLOk(data))
if (urls.empty() || !isURLOk(urls[0]))
{
result.second = msgNotFound;
return result;
}
data = unescapeHtmlUtf8(data);
data = unescapeHtmlUtf8(urls[0]);
//result.second = data;
//return result;
@@ -197,25 +211,7 @@ LyricsFetcher::Result GoogleLyricsFetcher::fetch(const std::string &artist, cons
bool GoogleLyricsFetcher::isURLOk(const std::string &url)
{
return url.find(getSiteKeyword()) != std::string::npos;
}
/**********************************************************************/
// Accepts a URL only when the generic GoogleLyricsFetcher check passes and
// the URL is longer than 25 characters.
bool LyricstimeFetcher::isURLOk(const std::string &url)
{
	if (!GoogleLyricsFetcher::isURLOk(url))
		return false;
	// The search sometimes yields the list of all artists starting with a
	// given letter, e.g. www.lyricstime.com/A.html. That URL is 25 characters
	// long, so only strictly longer ones are accepted.
	return url.length() > 25;
}
// Converts the fetched text from iso-8859-1 to utf-8 and then hands it to the
// generic LyricsFetcher::postProcess().
void LyricstimeFetcher::postProcess(std::string &data)
{
// lyricstime.com uses iso-8859-1 as the encoding
// so we need to convert obtained lyrics to utf-8
data = Charset::toUtf8From(data, "iso-8859-1");
LyricsFetcher::postProcess(data);
// NOTE(review): the statement below returns a value from a void function and
// uses `url`, which is not a parameter here. It looks like the replacement
// line for GoogleLyricsFetcher::isURLOk left over from the diff — confirm
// before treating it as part of this function.
return url.find(siteKeyword()) != std::string::npos;
}
/**********************************************************************/
@@ -253,33 +249,6 @@ void LyricsmaniaFetcher::postProcess(std::string &data)
/**********************************************************************/
// Removes the "[ ... lyrics are found on www.songlyrics.com ]" note from the
// fetched text, unescapes HTML entities and then runs the generic
// LyricsFetcher::postProcess().
//
// data: fetched lyrics, modified in place.
void SonglyricsFetcher::postProcess(std::string &data)
{
	const size_t note_begin = data.find('[');
	if (note_begin != std::string::npos)
	{
		// Search for the closing bracket only after the opening one and make
		// sure it exists. Otherwise the length computed below wraps around
		// and could erase the rest of the lyrics.
		const size_t note_end = data.find(']', note_begin);
		if (note_end != std::string::npos)
		{
			// +2 instead of +1 because there is an extra space after ']'
			// that should be removed as well.
			data.replace(note_begin, note_end-note_begin+2, "");
		}
	}
	data = unescapeHtmlUtf8(data);
	LyricsFetcher::postProcess(data);
}
/**********************************************************************/
// Removes the advertisement <div> from the fetched text, unescapes HTML
// entities and then runs the generic LyricsFetcher::postProcess().
//
// data: fetched lyrics, modified in place.
void LyricsvipFetcher::postProcess(std::string &data)
{
	const size_t ad_begin = data.find("<div class=\"ad\"");
	if (ad_begin != std::string::npos)
	{
		// Look for the closing tag after the start of the ad and make sure
		// it exists. Searching from the beginning of the text could pick up
		// an earlier </div> (or none at all) and make the length below wrap
		// around.
		const size_t ad_end = data.find("</div>", ad_begin);
		if (ad_end != std::string::npos)
			data.replace(ad_begin, ad_end-ad_begin+const_strlen("</div>"), "");
	}
	data = unescapeHtmlUtf8(data);
	LyricsFetcher::postProcess(data);
}
/**********************************************************************/
LyricsFetcher::Result InternetLyricsFetcher::fetch(const std::string &artist, const std::string &title)
{
GoogleLyricsFetcher::fetch(artist, title);