metal-archives/cover

#!/usr/bin/env python3
#
# http://github.com/mitchweaver/bin
#
# get a given album cover from metal-archives.com
#
# Usage: ./album 'hellhammer - death fiend'
#

from bs4 import BeautifulSoup

import requests
import sys
import re
import urllib

def find_album_url(band, album):
    """Look up the metal-archives.com page URL of *album* by *band*.

    The band page is fetched first and scanned for its "Complete
    discography" link (the site hides the discography behind a numeric
    ID, so the URL cannot be built directly). The discography page is then
    scanned for the first non-review link whose markup mentions the album.

    Args:
        band: Band name; lower-cased and appended to the band URL.
        album: Album title; matched case-insensitively against each link's
            markup (both its href and its text).

    Returns:
        The absolute URL of the matching album link, or None when no
        discography link or no matching album link is found.

    Raises:
        requests.RequestException: If a page cannot be fetched.
    """

    def get_soup(url):
        # A timeout keeps an unresponsive server from hanging the script.
        data = requests.get(url, timeout=30).text
        return BeautifulSoup(data, "html5lib")

    band_url = 'https://www.metal-archives.com/bands/' + band.lower()
    wanted = album.lower()

    for band_link in get_soup(band_url).find_all('a'):
        band_markup = str(band_link)
        # scrape the band page for the discography link
        # we need to do this because MA hides it behind an ID#
        if 'iscography' not in band_markup or 'omplete' not in band_markup:
            continue

        # Read the href attribute directly rather than string-replacing the
        # serialized tag, which breaks as soon as the markup differs.
        discography_url = band_link.get('href')
        if not discography_url:
            continue

        # scrape the discography page for the album
        for album_link in get_soup(discography_url).find_all('a'):
            album_markup = str(album_link)
            # skip the review links
            if 'reviews' in album_markup:
                continue
            if wanted not in album_markup.lower():
                continue
            href = album_link.get('href')
            # Only absolute http(s) links are usable by the caller.
            if href and href.startswith(('http://', 'https://')):
                return href

    return None

def get_album_art(url):
    """Download the cover image from an album page and save it as ./cover.jpg.

    The page at *url* is fetched, the ``src`` of every ``<img>`` is collected
    in document order, and the second one is downloaded and written to
    ``./cover.jpg`` in the current working directory.

    This is best-effort: failures are reported by printing a message
    (the reason goes to stderr) instead of raising.

    Args:
        url: URL of the album page. A false value (e.g. None when the album
            lookup failed) is reported as an error.

    Returns:
        None.
    """
    # A bare `import urllib` does not guarantee the `request` submodule is
    # loaded, so import it explicitly where it is used.
    import urllib.request

    header = {'User-Agent': 'Mozilla/5.0'}

    def fetch(address):
        # Send the same User-Agent on every request and close the response.
        request = urllib.request.Request(address, headers=header)
        with urllib.request.urlopen(request, timeout=30) as response:
            return response.read()

    if not url:
        print("Error - can't find it?")
        return

    try:
        soup = BeautifulSoup(fetch(url), "html5lib")
        image_urls = [img['src'] for img in soup.find_all("img")]

        # NOTE(review): index 1 presumably skips a site logo that precedes
        # the cover on the album page -- confirm against the live markup.
        image = fetch(image_urls[1])

        # Open the file only after the download succeeded, so a failure
        # never leaves an empty cover.jpg behind.
        with open("./cover.jpg", 'wb') as out:
            out.write(image)
    except Exception as exc:
        # Unlike a bare `except:`, this lets KeyboardInterrupt and
        # SystemExit propagate while still keeping the script best-effort.
        print("Error - can't find it?")
        print(f"({type(exc).__name__}: {exc})", file=sys.stderr)

def main():
    """Parse ``'band - album'`` from the command line and fetch the cover.

    The first command-line argument is split at the first ``' - '`` into a
    band name and an album title, so titles that themselves contain
    ``' - '`` stay intact. The album page is looked up with
    ``find_album_url`` and its result is passed to ``get_album_art``.

    Raises:
        SystemExit: With a usage message when the argument is missing or
            is not of the form ``'band - album'``.
    """
    usage = f"usage: {sys.argv[0]} 'band - album'"

    if len(sys.argv) < 2:
        sys.exit(usage)

    band, separator, album = sys.argv[1].partition(' - ')
    band = band.strip()
    album = album.strip()
    if not separator or not band or not album:
        sys.exit(usage)

    url = find_album_url(band, album)
    get_album_art(url)


if __name__ == "__main__":
    main()