metal-archives/songs

#!/usr/bin/env python3
#
# http://github.com/mitchweaver/metal-archives
#
# get songs from a given 'band - album' from metal-archives.com
#
# Usage: ./songs 'ulver - bergtatt'
#

import requests
import sys
import re
import urllib
from bs4 import BeautifulSoup
from bs4.element import Comment
import urllib.request

def find_album_url(band, album):
    """Return the URL of *album* by *band* on metal-archives.com.

    The band page is fetched first and scanned for its "Complete
    discography" link (the site hides the discography behind an ID, so the
    address cannot be built directly). The discography page is then scanned
    for the first non-review link whose markup contains the album name,
    compared case-insensitively.

    Args:
        band: Band name; it is lower-cased and appended to the band URL.
        album: Album name, or any substring of it.

    Returns:
        The absolute album URL as a string, or None when no matching link
        was found.
    """

    def get_soup(url):
        # download a page and parse it
        data = requests.get(url).text
        return BeautifulSoup(data, "html5lib")

    band_url = 'https://www.metal-archives.com/bands/' + band.lower()
    wanted = album.lower()

    for band_link in get_soup(band_url).find_all('a'):
        band_markup = str(band_link)
        # scrape the band page for the discography link
        # we need to do this because MA hides it behind an ID#
        if 'iscography' not in band_markup or 'omplete' not in band_markup:
            continue

        # read the href attribute instead of stripping the markup around it
        discography_url = band_link.get('href')
        if not discography_url:
            continue

        # scrape the discography page
        for album_link in get_soup(discography_url).find_all('a'):
            album_markup = str(album_link)
            # skip the review links
            if 'reviews' in album_markup:
                continue
            # look for the album
            if wanted not in album_markup.lower():
                continue
            href = album_link.get('href')
            # only absolute links can be album pages; skip anchors and the like
            if href and href.startswith(('http://', 'https://')):
                return href

    return None

def get_songs(url):
    """Download the album page at *url* and print its track list.

    The page is reduced to its visible text nodes. Nodes after the first one
    containing 'Additional notes' and before the first one containing
    'Complete lineup' are collected, minus a set of known noise strings.
    The last collected field (the total time) is dropped and the rest are
    printed three per line as ``<first> <second> (<third>)`` -- presumably
    track number, title and duration.

    Args:
        url: Address of the album page to scrape.

    Returns:
        None. The track list is written to stdout.
    """
    hidden_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')
    # text nodes containing any of these are not track data
    noise_markers = (
        '(loading lyrics...)', 'Show lyrics',
        'Single-sided', 'Double-sided',
        'ompilation', 'Side A', 'Side B',
    )

    def tag_visible(element):
        # keep only text a browser would render
        if element.parent.name in hidden_parents:
            return False
        return not isinstance(element, Comment)

    def text_from_html(body):
        soup = BeautifulSoup(body, 'html.parser')
        return [node for node in soup.find_all(string=True) if tag_visible(node)]

    # close the connection as soon as the body has been read
    with urllib.request.urlopen(url) as response:
        html = response.read()

    fields = []
    in_tracklist = False
    for node in text_from_html(html):
        if 'Complete lineup' in node:
            break
        if node == 'instrumental' or any(marker in node for marker in noise_markers):
            continue
        text = node.strip()
        # whitespace-only nodes carry no data
        if not text:
            continue
        if in_tracklist:
            fields.append(text)
        elif 'Additional notes' in text:
            in_tracklist = True

    # drop the last item (the total time); a slice is safe on an empty list
    fields = fields[:-1]

    for index, field in enumerate(fields):
        column = index % 3
        if column == 0:
            print(field, end=' ')
        elif column == 1:
            print(field, end='')
        else:
            # parenthesise by position, not by ':' -- titles may contain colons
            print(' (' + field + ')')


def main():
    """Parse ``'band - album'`` from the command line and print its songs.

    The first command-line argument is split on the first ' - ' only, so an
    album name that itself contains ' - ' is passed on intact.

    Raises:
        SystemExit: With a message on stderr when the argument is missing or
            malformed, or when no album URL could be found.
    """
    usage = "Usage: ./songs 'band - album'"
    if len(sys.argv) < 2:
        sys.exit(usage)

    band, separator, album = sys.argv[1].partition(' - ')
    if not separator or not band.strip() or not album.strip():
        sys.exit(usage)

    url = find_album_url(band, album)
    if url is None:
        sys.exit("could not find album '" + album + "' by '" + band + "'")
    get_songs(url)

# run the scraper only when executed as a script, not when imported
if __name__ == "__main__":
    main()