commit 39eb9205cf2e578d3e369613cd37b3114d7ed3bb Author: Mitch Weaver <20451170+MitchWeaver@users.noreply.github.com> Date: Mon Apr 23 17:56:32 2018 +0200 initial commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5051b0e --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +*.pyc +*.core +text-scraping.py +notes +notes.md diff --git a/README.md b/README.md new file mode 100644 index 0000000..7d3decf --- /dev/null +++ b/README.md @@ -0,0 +1,34 @@ +# metal-archives web scrapers + +Helpful scripts to get information from http://metal-archives.com + +### releases +![releases.png](res/releases.png) + +### cover +![cover.png](res/cover.png) + +### band +![band.png](res/band.png) + +### logo +![logo.png](res/logo.png) + +### lyrics +![lyrics.png](res/lyrics.png) + +### songs +![songs.png](res/songs.png) + +### genre +![genre.png](res/genre.png) + +### location +![location.png](res/location.png) + + +Originally these were methods from http://github.com/mitchweaver/diskvlt-bot + +But as they are so handy, I now use them to categorize and tag my music. + + diff --git a/band b/band new file mode 100755 index 0000000..e15828c --- /dev/null +++ b/band @@ -0,0 +1,35 @@ +#!/usr/bin/env python3 +# +# http://github.com/mitchweaver/bin +# +# get a given band pic from metal-archives.com +# +# Usage: ./band 'judas iscariot' +# + +from bs4 import BeautifulSoup +import urllib +import sys + +def band_pic(band): + try: + url = 'https://www.metal-archives.com/bands/' + band.replace(' ', '_').lower() + + def get_soup(url,header): + return BeautifulSoup(urllib.request.urlopen(urllib.request.Request(url,headers=header)), "html5lib") + + header = {'User-Agent': 'Mozilla/5.0'} + image_urls = [a['src'] for a in get_soup(url, header).find_all("img")] + + image = urllib.request.urlopen(image_urls[2]).read() + + file = open("./band.jpg", 'wb') + file.write(image) + file.close() + except: + print("Error - can't find it?") + +def main(): + band_pic(sys.argv[1]) + +if __name__ == "__main__": main() diff --git a/cover b/cover new file mode 100755 index 0000000..e568f70 --- /dev/null +++ b/cover @@ -0,0 +1,63 @@ +#!/usr/bin/env python3 +# +# http://github.com/mitchweaver/bin +# +# get a given album cover from metal-archives.com +# +# Usage: ./album 'hellhammer - death fiend' +# + +from bs4 import BeautifulSoup + +import requests +import sys +import re +import urllib + +def find_album_url(band, album): + + def get_soup(url): + data = requests.get(url).text + return BeautifulSoup(data, "html5lib") + + url = 'https://www.metal-archives.com/bands/' + band.lower() + soup = get_soup(url) + + for link in soup.find_all('a'): + strlink = str(link) + # scrape the band page for the discography link + # we need to do this because MA hides it behind an ID# + if 'iscography' in strlink and 'omplete' in strlink: + # sed out the href garbage + url = strlink.replace('Complete discography', '') + # scrape the album page + for link in get_soup(url).find_all('a'): + # sed out the reviews + if not 'reviews' in str(link): + # look for the album + if album.lower() in str(link).lower(): + urls = re.findall('https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+/.*."', str(link)) + return(urls[0].replace('"', '')) + +def get_album_art(url): + try: + def get_soup(url,header): + return BeautifulSoup(urllib.request.urlopen(urllib.request.Request(url,headers=header)), "html5lib") + + header = {'User-Agent': 'Mozilla/5.0'} + image_urls = [a['src'] for a in get_soup(url, header).find_all("img")] + + image = urllib.request.urlopen(image_urls[1]).read() + + file = open("./cover.jpg", 'wb') + file.write(image) + file.close() + except: + print("Error - can't find it?") + +def main(): + arg = sys.argv[1].split(' - ') + url = find_album_url(arg[0], arg[1]) + get_album_art(url) + +if __name__ == "__main__": main() diff --git a/genre b/genre new file mode 100755 index 0000000..be563bd --- /dev/null +++ b/genre @@ -0,0 +1,57 @@ +#!/usr/bin/env python3 +# +# http://github.com/mitchweaver/bin +# +# get the genre of a given band from metal-archives +# +# Usage: ./genre 'darkthrone' +# + +from bs4 import BeautifulSoup +from bs4.element import Comment +import urllib.request +import sys + +def tag_visible(element): + if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']: + return False + if isinstance(element, Comment): + return False + return True + +def text_from_html(body): + soup = BeautifulSoup(body, 'html.parser') + texts = soup.findAll(text=True) + visible_texts = filter(tag_visible, texts) + lines = [] + for line in visible_texts: + lines.append(line) + return lines + + # return " ".join(t.strip() for t in visible_texts) + +def parse(url): + try: + html = urllib.request.urlopen(url).read() + + genre = [] + found_count = 0 + for line in text_from_html(html): + if len(line.strip()) < 2: continue + if 'Genre:' in line: + genre.append(line) + found_count = 1 + elif found_count > 0: + genre.append(line) + found_count += 1 + if found_count == 2: + break + + print(genre[0] + ' ' + genre[1]) + except: + print("Can't find it?") + +def main(): + parse(url = 'https://www.metal-archives.com/bands/' + sys.argv[1]) + +if __name__ == "__main__": main() diff --git a/location b/location new file mode 100755 index 0000000..e8cbd83 --- /dev/null +++ b/location @@ -0,0 +1,56 @@ +#!/usr/bin/env python3 +# +# http://github.com/mitchweaver/bin +# +# get the location of a given band from metal-archives +# +# Usage: ./location 'black sabbath' +# + +from bs4 import BeautifulSoup +from bs4.element import Comment +import urllib.request +import sys + +def tag_visible(element): + if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']: + return False + if isinstance(element, Comment): + return False + return True + +def text_from_html(body): + soup = BeautifulSoup(body, 'html.parser') + texts = soup.findAll(text=True) + visible_texts = filter(tag_visible, texts) + lines = [] + for line in visible_texts: + lines.append(line) + return lines + +def parse(url): + try: + html = urllib.request.urlopen(url.replace(' ', '_').lower()).read() + + location = [] + found_count = 0 + for line in text_from_html(html): + if len(line.strip()) < 2: continue + if 'Country of origin:' in line: + location.append(line) + found_count = 1 + elif found_count > 0: + location.append(line) + found_count += 1 + if found_count == 4: + break + + print(location[0] + ' ' + location[1]) + print(location[2] + ' ' + location[3]) + except: + print("Can't find it?") + +def main(): + parse(url = 'https://www.metal-archives.com/bands/' + sys.argv[1]) + +if __name__ == "__main__": main() diff --git a/logo b/logo new file mode 100755 index 0000000..34828a1 --- /dev/null +++ b/logo @@ -0,0 +1,33 @@ +#!/usr/bin/env python3 +# +# http://github.com/mitchweaver/bin +# +# get a given band logo from metal-archives.com +# + +from bs4 import BeautifulSoup +import urllib +import sys + +def band_logo(band): + try: + url = 'https://www.metal-archives.com/bands/' + band.replace(' ', '_').lower() + + def get_soup(url,header): + return BeautifulSoup(urllib.request.urlopen(urllib.request.Request(url,headers=header)), "html5lib") + + header = {'User-Agent': 'Mozilla/5.0'} + image_urls = [a['src'] for a in get_soup(url, header).find_all("img")] + + image = urllib.request.urlopen(image_urls[1]).read() + + file = open("./logo.jpg", 'wb') + file.write(image) + file.close() + except: + print("Error - can't find it?") + +def main(): + band_logo(sys.argv[1]) + +if __name__ == "__main__": main() diff --git a/releases b/releases new file mode 100755 index 0000000..be3be27 --- /dev/null +++ b/releases @@ -0,0 +1,111 @@ +#!/usr/bin/env python3 +# +# http://github.com/mitchweaver/bin +# +# get releases of a band from metal-archives +# +# Usage: ./releases 'iron maiden' +# + +from bs4 import BeautifulSoup +import requests +import urllib +import re +import sys + +class Line(): + type = "" + title = "" + year = "" + + def __init__(self, title, year, type): + self.type = type + self.title = title + self.year = year + + def concat(self): + line = self.title + " - " + self.year + " - (" \ + + self.type + ")" + + return(line.strip()) + +def error(): + print("Error: can't find it?") + quit() + +def get_releases(band): + + try: + ID = "" + url = ("https://www.metal-archives.com/bands/" + band.replace(" ", "_").lower()) + + html = urllib.request.urlopen(url).read() + + soup = BeautifulSoup(html, "html5lib") + + match = re.search(r'\bbandId\b.*', soup.get_text()) + + if match is not None: + tmp = match.group(0).split(" = ") + tmp = tmp[1][:-1] + ID = tmp + else: + error() + + url = "https://www.metal-archives.com/band/discography/id/" \ + + ID + "/tab/all" + + html = urllib.request.urlopen(url).read() + + soup = BeautifulSoup(html, "html5lib") + text = soup.get_text() + + raw_lines = (line.strip() for line in text.splitlines()) + + lines = [] + # get rid of ratings in lines + for line in raw_lines: + if "%" not in line: + lines.append(line) + + # temps + title = "" + year = "" + type = "" + + formatted_lines = [] + + for line in lines: + # skip whitespace / garble + if len(line) < 2: continue + + # check if year + match = re.match(r'.*[1-3][0-9]{3}', line) + if match is not None: + year = match.group(0) + + # get type + elif line == "Full-length": type = line + elif line == "EP": type = line + elif line == "Demo": type = line + elif line == "Promo": type = line + elif line == "Compilation": type = line + # elif line == "Single": type = line + # elif line == "Boxed set": type = line + + # else this must be our title + else: title = line + + if title != "" and year != "" and type != "": + formatted_lines.append(Line(title, year, type)) + title = year = type = "" + + for line in formatted_lines: + print(line.concat()) + except: + error() + +def main(): + get_releases(sys.argv[1]) + +if __name__ == "__main__": main() diff --git a/res/band.png b/res/band.png new file mode 100644 index 0000000..7ccaa0d Binary files /dev/null and b/res/band.png differ diff --git a/res/cover.png b/res/cover.png new file mode 100644 index 0000000..0746c99 Binary files /dev/null and b/res/cover.png differ diff --git a/res/genre.png b/res/genre.png new file mode 100644 index 0000000..aa4f8a7 Binary files /dev/null and b/res/genre.png differ diff --git a/res/location.png b/res/location.png new file mode 100644 index 0000000..8ca52ac Binary files /dev/null and b/res/location.png differ diff --git a/res/logo.png b/res/logo.png new file mode 100644 index 0000000..7c8c1dd Binary files /dev/null and b/res/logo.png differ diff --git a/res/lyrics.png b/res/lyrics.png new file mode 100644 index 0000000..5604101 Binary files /dev/null and b/res/lyrics.png differ diff --git a/res/releases.png b/res/releases.png new file mode 100644 index 0000000..ded284d Binary files /dev/null and b/res/releases.png differ diff --git a/res/songs.png b/res/songs.png new file mode 100644 index 0000000..23c1e35 Binary files /dev/null and b/res/songs.png differ diff --git a/songs b/songs new file mode 100755 index 0000000..c774306 --- /dev/null +++ b/songs @@ -0,0 +1,100 @@ +#!/usr/bin/env python3 +# +# http://github.com/mitchweaver/bin +# +# get songs from a given 'band - album' from metal-archives.com +# +# Usage: ./songs 'ulver - bergtatt' +# + +import requests +import sys +import re +import urllib +from bs4 import BeautifulSoup +from bs4.element import Comment +import urllib.request + +def find_album_url(band, album): + + def get_soup(url): + data = requests.get(url).text + return BeautifulSoup(data, "html5lib") + + url = 'https://www.metal-archives.com/bands/' + band.lower() + soup = get_soup(url) + + for link in soup.find_all('a'): + strlink = str(link) + # scrape the band page for the discography link + # we need to do this because MA hides it behind an ID# + if 'iscography' in strlink and 'omplete' in strlink: + # sed out the href garbage + url = strlink.replace('Complete discography', '') + # scrape the album page + for link in get_soup(url).find_all('a'): + # sed out the reviews + if not 'reviews' in str(link): + # look for the album + if album.lower() in str(link).lower(): + urls = re.findall('https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+/.*."', str(link)) + return(urls[0].replace('"', '')) + +def get_songs(url): + def tag_visible(element): + if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']: + return False + if isinstance(element, Comment): + return False + return True + + def text_from_html(body): + soup = BeautifulSoup(body, 'html.parser') + texts = soup.findAll(text=True) + visible_texts = filter(tag_visible, texts) + lines = [] + for line in visible_texts: + lines.append(line) + return lines + + html = urllib.request.urlopen(url).read() + + songs = [] + found = False + for line in text_from_html(html): + if 'Complete lineup' in line: + break + if '(loading lyrics...)' in line or 'Show lyrics' in line or \ + 'Single-sided' in line or 'Double-sided' in line or \ + line == '\n' or line == ' ' or line == 'instrumental' or \ + 'ompilation' in line or 'Side A' in line or 'Side B' in line: + continue + line = line.strip() + if len(line) > 0: + if found: + songs.append(line) + elif 'Additional notes' in line: + found = True + continue + + # delete last item in list, (the total time) + del songs[-1] + + count = 0 + while count < len(songs): + if ':' in songs[count]: + print(' (', end='') + print(songs[count], end='') + count += 1 + if (count % 3) == 0: + print(')\n', end='') + elif (count % 3) == 1: + print(' ', end='') + + +def main(): + arg = sys.argv[1].split(' - ') + url = find_album_url(arg[0], arg[1]) + get_songs(url) + +if __name__ == "__main__": main() diff --git a/text-scaper.py b/text-scaper.py new file mode 100755 index 0000000..a6f140a --- /dev/null +++ b/text-scaper.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python3 +from bs4 import BeautifulSoup +from bs4.element import Comment +import urllib.request + +def tag_visible(element): + if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']: + return False + if isinstance(element, Comment): + return False + return True + +def text_from_html(body): + soup = BeautifulSoup(body, 'html.parser') + texts = soup.findAll(text=True) + visible_texts = filter(tag_visible, texts) + lines = [] + for line in visible_texts: + lines.append(line) + return lines + + # return " ".join(t.strip() for t in visible_texts) + +url = 'https://www.metal-archives.com/bands/burzum' +html = urllib.request.urlopen(url).read() + +for line in text_from_html(html): + if len(line) > 2: + print(line.strip())