initial commit
5
.gitignore
vendored
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
*.pyc
|
||||||
|
*.core
|
||||||
|
text-scraping.py
|
||||||
|
notes
|
||||||
|
notes.md
|
||||||
34
README.md
Normal file
@@ -0,0 +1,34 @@
|
|||||||
|
# metal-archives web scrapers
|
||||||
|
|
||||||
|
Helpful scripts to get information from https://www.metal-archives.com
|
||||||
|
|
||||||
|
### releases
|
||||||
|

|
||||||
|
|
||||||
|
### cover
|
||||||
|

|
||||||
|
|
||||||
|
### band
|
||||||
|

|
||||||
|
|
||||||
|
### logo
|
||||||
|

|
||||||
|
|
||||||
|
### lyrics
|
||||||
|

|
||||||
|
|
||||||
|
### songs
|
||||||
|

|
||||||
|
|
||||||
|
### genre
|
||||||
|

|
||||||
|
|
||||||
|
### location
|
||||||
|

|
||||||
|
|
||||||
|
|
||||||
|
Originally these were methods from http://github.com/mitchweaver/diskvlt-bot
|
||||||
|
|
||||||
|
But as they are so handy, I now use them to categorize and tag my music.
|
||||||
|
|
||||||
|
|
||||||
35
band
Executable file
@@ -0,0 +1,35 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
#
|
||||||
|
# http://github.com/mitchweaver/bin
|
||||||
|
#
|
||||||
|
# get a given band pic from metal-archives.com
|
||||||
|
#
|
||||||
|
# Usage: ./band 'judas iscariot'
|
||||||
|
#
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import urllib
|
||||||
|
import sys
|
||||||
|
|
||||||
|
def band_pic(band):
    """Download the band photo for *band* from metal-archives.com.

    The image is written to ./band.jpg in the current directory.
    On any failure (band not found, network error, unexpected page
    layout) an error message is printed instead of raising.
    """
    # Local import: the file-level ``import urllib`` alone does not
    # guarantee the ``urllib.request`` submodule is loaded.
    import urllib.request

    try:
        url = 'https://www.metal-archives.com/bands/' + band.replace(' ', '_').lower()

        def get_soup(url, header):
            # MA rejects the default Python user agent, hence the header.
            return BeautifulSoup(
                urllib.request.urlopen(urllib.request.Request(url, headers=header)),
                "html5lib")

        header = {'User-Agent': 'Mozilla/5.0'}
        image_urls = [a['src'] for a in get_soup(url, header).find_all("img")]

        # Index 2: the third <img> on a band page is the band photo
        # (empirical -- TODO confirm against the current page layout).
        image = urllib.request.urlopen(image_urls[2]).read()

        # ``with`` guarantees the file handle is closed even on error.
        with open("./band.jpg", 'wb') as out:
            out.write(image)
    except Exception:
        # Was a bare ``except:``; narrowed so KeyboardInterrupt/SystemExit
        # are no longer swallowed.
        print("Error - can't find it?")
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: fetch the photo for the band named in argv[1]."""
    band_pic(sys.argv[1])


if __name__ == "__main__":
    main()
|
||||||
63
cover
Executable file
@@ -0,0 +1,63 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
#
|
||||||
|
# http://github.com/mitchweaver/bin
|
||||||
|
#
|
||||||
|
# get a given album cover from metal-archives.com
|
||||||
|
#
|
||||||
|
# Usage: ./album 'hellhammer - death fiend'
|
||||||
|
#
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
import requests
|
||||||
|
import sys
|
||||||
|
import re
|
||||||
|
import urllib
|
||||||
|
|
||||||
|
def find_album_url(band, album):
    """Return the metal-archives URL of *album* by *band*, or None.

    Scrapes the band page for the "Complete discography" link (the
    discography URL embeds a numeric ID, so it cannot be built
    directly), then scans that page for a link whose text contains
    the album name.
    """

    def get_soup(url):
        data = requests.get(url).text
        return BeautifulSoup(data, "html5lib")

    url = 'https://www.metal-archives.com/bands/' + band.lower()
    soup = get_soup(url)

    for link in soup.find_all('a'):
        strlink = str(link)
        # Substring match sidesteps Discography/discography casing.
        if 'iscography' in strlink and 'omplete' in strlink:
            # Strip the surrounding anchor markup, keeping only the href.
            url = strlink.replace('<a href="', '').replace('"><span>Complete discography</span></a>', '')
            # Scrape the discography page for the album link.
            for link in get_soup(url).find_all('a'):
                # Skip review links, which also contain the album title.
                if 'reviews' not in str(link):
                    if album.lower() in str(link).lower():
                        # Raw string: '\w'/'\d' in a plain literal is an
                        # invalid escape (DeprecationWarning since 3.6).
                        urls = re.findall(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+/.*."', str(link))
                        return urls[0].replace('"', '')
|
||||||
|
|
||||||
|
def get_album_art(url):
    """Download the cover image from the album page at *url*.

    The image is written to ./cover.jpg in the current directory.
    On any failure an error message is printed instead of raising.
    """
    # Local import: file-level ``import urllib`` alone does not
    # guarantee the ``urllib.request`` submodule is loaded.
    import urllib.request

    try:
        def get_soup(url, header):
            # MA rejects the default Python user agent, hence the header.
            return BeautifulSoup(
                urllib.request.urlopen(urllib.request.Request(url, headers=header)),
                "html5lib")

        header = {'User-Agent': 'Mozilla/5.0'}
        image_urls = [a['src'] for a in get_soup(url, header).find_all("img")]

        # Index 1: the second <img> on an album page is the cover
        # (empirical -- TODO confirm against the current page layout).
        image = urllib.request.urlopen(image_urls[1]).read()

        # ``with`` guarantees the file handle is closed even on error.
        with open("./cover.jpg", 'wb') as out:
            out.write(image)
    except Exception:
        # Was a bare ``except:``; narrowed to Exception.
        print("Error - can't find it?")
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: expects one argument of the form 'band - album'."""
    parts = sys.argv[1].split(' - ')
    get_album_art(find_album_url(parts[0], parts[1]))


if __name__ == "__main__":
    main()
|
||||||
57
genre
Executable file
@@ -0,0 +1,57 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
#
|
||||||
|
# http://github.com/mitchweaver/bin
|
||||||
|
#
|
||||||
|
# get the genre of a given band from metal-archives
|
||||||
|
#
|
||||||
|
# Usage: ./genre 'darkthrone'
|
||||||
|
#
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from bs4.element import Comment
|
||||||
|
import urllib.request
|
||||||
|
import sys
|
||||||
|
|
||||||
|
def tag_visible(element):
    """Return True when *element* is text a browser would render."""
    hidden_parents = ['style', 'script', 'head', 'title', 'meta', '[document]']
    if element.parent.name in hidden_parents:
        return False
    # HTML comments are text nodes too, but never displayed.
    return not isinstance(element, Comment)
|
||||||
|
|
||||||
|
def text_from_html(body):
    """Return the list of visible text fragments in HTML *body*.

    Fragments are returned in document order, unstripped.
    """
    soup = BeautifulSoup(body, 'html.parser')
    texts = soup.findAll(text=True)
    # Manual append loop replaced by list(): same elements, same order.
    return list(filter(tag_visible, texts))
|
||||||
|
|
||||||
|
# return " ".join(t.strip() for t in visible_texts)
|
||||||
|
|
||||||
|
def parse(url):
    """Print the 'Genre:' label and its value from the band page at *url*.

    Prints a fallback message on any failure instead of raising.
    """
    try:
        html = urllib.request.urlopen(url).read()

        genre = []
        found_count = 0
        for line in text_from_html(html):
            # Skip whitespace-only / single-character fragments.
            if len(line.strip()) < 2:
                continue
            if 'Genre:' in line:
                genre.append(line)
                found_count = 1
            elif found_count > 0:
                # The first substantial line after the label is the value.
                genre.append(line)
                found_count += 1
                if found_count == 2:
                    break

        print(genre[0] + ' ' + genre[1])
    except Exception:
        # Was a bare ``except:``; narrowed to Exception.
        print("Can't find it?")
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: argv[1] is the band name as it appears in the URL."""
    parse(url='https://www.metal-archives.com/bands/' + sys.argv[1])


if __name__ == "__main__":
    main()
|
||||||
56
location
Executable file
@@ -0,0 +1,56 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
#
|
||||||
|
# http://github.com/mitchweaver/bin
|
||||||
|
#
|
||||||
|
# get the location of a given band from metal-archives
|
||||||
|
#
|
||||||
|
# Usage: ./location 'black sabbath'
|
||||||
|
#
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from bs4.element import Comment
|
||||||
|
import urllib.request
|
||||||
|
import sys
|
||||||
|
|
||||||
|
def tag_visible(element):
    """Return True when *element* is text a browser would render."""
    hidden_parents = ['style', 'script', 'head', 'title', 'meta', '[document]']
    if element.parent.name in hidden_parents:
        return False
    # HTML comments are text nodes too, but never displayed.
    return not isinstance(element, Comment)
|
||||||
|
|
||||||
|
def text_from_html(body):
    """Return the list of visible text fragments in HTML *body*.

    Fragments are returned in document order, unstripped.
    """
    soup = BeautifulSoup(body, 'html.parser')
    texts = soup.findAll(text=True)
    # Manual append loop replaced by list(): same elements, same order.
    return list(filter(tag_visible, texts))
|
||||||
|
|
||||||
|
def parse(url):
    """Print country and location info from the band page at *url*.

    The band-name portion of *url* may contain spaces; they are
    replaced with underscores and the URL lowercased before the
    request. Prints a fallback message on any failure.
    """
    try:
        html = urllib.request.urlopen(url.replace(' ', '_').lower()).read()

        location = []
        found_count = 0
        for line in text_from_html(html):
            # Skip whitespace-only / single-character fragments.
            if len(line.strip()) < 2:
                continue
            if 'Country of origin:' in line:
                location.append(line)
                found_count = 1
            elif found_count > 0:
                # Collect the three lines after the label: country value,
                # 'Location:' label, location value.
                location.append(line)
                found_count += 1
                if found_count == 4:
                    break

        print(location[0] + ' ' + location[1])
        print(location[2] + ' ' + location[3])
    except Exception:
        # Was a bare ``except:``; narrowed to Exception.
        print("Can't find it?")
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: argv[1] is the band name (spaces allowed)."""
    parse(url='https://www.metal-archives.com/bands/' + sys.argv[1])


if __name__ == "__main__":
    main()
|
||||||
33
logo
Executable file
@@ -0,0 +1,33 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
#
|
||||||
|
# http://github.com/mitchweaver/bin
|
||||||
|
#
|
||||||
|
# get a given band logo from metal-archives.com
|
||||||
|
#
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import urllib
|
||||||
|
import sys
|
||||||
|
|
||||||
|
def band_logo(band):
    """Download the logo for *band* from metal-archives.com.

    The image is written to ./logo.jpg in the current directory.
    On any failure an error message is printed instead of raising.
    """
    # Local import: file-level ``import urllib`` alone does not
    # guarantee the ``urllib.request`` submodule is loaded.
    import urllib.request

    try:
        url = 'https://www.metal-archives.com/bands/' + band.replace(' ', '_').lower()

        def get_soup(url, header):
            # MA rejects the default Python user agent, hence the header.
            return BeautifulSoup(
                urllib.request.urlopen(urllib.request.Request(url, headers=header)),
                "html5lib")

        header = {'User-Agent': 'Mozilla/5.0'}
        image_urls = [a['src'] for a in get_soup(url, header).find_all("img")]

        # Index 1: the second <img> on a band page is the logo
        # (empirical -- TODO confirm against the current page layout).
        image = urllib.request.urlopen(image_urls[1]).read()

        # ``with`` guarantees the file handle is closed even on error.
        with open("./logo.jpg", 'wb') as out:
            out.write(image)
    except Exception:
        # Was a bare ``except:``; narrowed to Exception.
        print("Error - can't find it?")
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: fetch the logo for the band named in argv[1]."""
    band_logo(sys.argv[1])


if __name__ == "__main__":
    main()
|
||||||
111
releases
Executable file
@@ -0,0 +1,111 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
#
|
||||||
|
# http://github.com/mitchweaver/bin
|
||||||
|
#
|
||||||
|
# get releases of a band from metal-archives
|
||||||
|
#
|
||||||
|
# Usage: ./releases 'iron maiden'
|
||||||
|
#
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import requests
|
||||||
|
import urllib
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
|
||||||
|
class Line():
    """One discography entry: release title, year, and release type."""

    def __init__(self, title, year, type):
        # ``type`` shadows the builtin, but the name is part of the
        # existing call signature, so it is kept for compatibility.
        # (Redundant class-level attribute defaults removed: they were
        # always shadowed by these instance attributes.)
        self.type = type
        self.title = title
        self.year = year

    def concat(self):
        """Return the entry formatted as 'title - year - (type)'."""
        line = self.title + " - " + self.year + " - (" + self.type + ")"
        return line.strip()
|
||||||
|
|
||||||
|
def error():
    """Print a generic failure message and terminate the script.

    Raises SystemExit (exit status 1).
    """
    print("Error: can't find it?")
    # sys.exit(1) instead of quit(): quit() is injected by the site
    # module (not guaranteed to exist), and a failure should report a
    # nonzero exit status.
    sys.exit(1)
|
||||||
|
|
||||||
|
def get_releases(band):
    """Print every release of *band* as 'title - year - (type)' lines.

    Scrapes the band page for the numeric band ID (only present in an
    inline script as 'bandId = NNNN;'), fetches the full discography
    table for that ID, and parses the table text into
    (title, year, type) triples. Prints an error and exits on failure.
    """
    try:
        ID = ""
        url = ("https://www.metal-archives.com/bands/" + band.replace(" ", "_").lower())

        html = urllib.request.urlopen(url).read()
        soup = BeautifulSoup(html, "html5lib")

        # Locate the 'bandId = NNNN;' assignment in the page text.
        match = re.search(r'\bbandId\b.*', soup.get_text())
        if match is not None:
            tmp = match.group(0).split(" = ")
            tmp = tmp[1][:-1]  # drop the trailing ';'
            ID = tmp
        else:
            error()

        url = "https://www.metal-archives.com/band/discography/id/" \
            + ID + "/tab/all"

        html = urllib.request.urlopen(url).read()
        soup = BeautifulSoup(html, "html5lib")
        text = soup.get_text()

        raw_lines = (line.strip() for line in text.splitlines())

        # Drop rating lines (they always contain a '%').
        lines = [line for line in raw_lines if "%" not in line]

        # Accumulators for the current (title, year, type) triple.
        title = ""
        year = ""
        type = ""

        formatted_lines = []

        for line in lines:
            # Skip whitespace / garble.
            if len(line) < 2:
                continue

            # A 4-digit number starting 1-3 marks the release year.
            match = re.match(r'.*[1-3][0-9]{3}', line)
            if match is not None:
                year = match.group(0)

            # Known release types (Single / Boxed set deliberately
            # excluded, as in the original).
            elif line in ("Full-length", "EP", "Demo", "Promo", "Compilation"):
                type = line

            # Otherwise this must be a release title.
            else:
                title = line

            # Once a full triple is collected, record it and reset.
            if title != "" and year != "" and type != "":
                formatted_lines.append(Line(title, year, type))
                title = year = type = ""

        for line in formatted_lines:
            print(line.concat())
    except Exception:
        # Was a bare ``except:``; narrowed to Exception.
        error()
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: print the discography of the band in argv[1]."""
    get_releases(sys.argv[1])


if __name__ == "__main__":
    main()
|
||||||
BIN
res/band.png
Normal file
|
After Width: | Height: | Size: 128 KiB |
BIN
res/cover.png
Normal file
|
After Width: | Height: | Size: 125 KiB |
BIN
res/genre.png
Normal file
|
After Width: | Height: | Size: 6.4 KiB |
BIN
res/location.png
Normal file
|
After Width: | Height: | Size: 8.0 KiB |
BIN
res/logo.png
Normal file
|
After Width: | Height: | Size: 39 KiB |
BIN
res/lyrics.png
Normal file
|
After Width: | Height: | Size: 29 KiB |
BIN
res/releases.png
Normal file
|
After Width: | Height: | Size: 12 KiB |
BIN
res/songs.png
Normal file
|
After Width: | Height: | Size: 20 KiB |
100
songs
Executable file
@@ -0,0 +1,100 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
#
|
||||||
|
# http://github.com/mitchweaver/bin
|
||||||
|
#
|
||||||
|
# get songs from a given 'band - album' from metal-archives.com
|
||||||
|
#
|
||||||
|
# Usage: ./songs 'ulver - bergtatt'
|
||||||
|
#
|
||||||
|
|
||||||
|
import requests
|
||||||
|
import sys
|
||||||
|
import re
|
||||||
|
import urllib
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from bs4.element import Comment
|
||||||
|
import urllib.request
|
||||||
|
|
||||||
|
def find_album_url(band, album):
    """Return the metal-archives URL of *album* by *band*, or None.

    Scrapes the band page for the "Complete discography" link (the
    discography URL embeds a numeric ID, so it cannot be built
    directly), then scans that page for a link whose text contains
    the album name.
    """

    def get_soup(url):
        data = requests.get(url).text
        return BeautifulSoup(data, "html5lib")

    url = 'https://www.metal-archives.com/bands/' + band.lower()
    soup = get_soup(url)

    for link in soup.find_all('a'):
        strlink = str(link)
        # Substring match sidesteps Discography/discography casing.
        if 'iscography' in strlink and 'omplete' in strlink:
            # Strip the surrounding anchor markup, keeping only the href.
            url = strlink.replace('<a href="', '').replace('"><span>Complete discography</span></a>', '')
            # Scrape the discography page for the album link.
            for link in get_soup(url).find_all('a'):
                # Skip review links, which also contain the album title.
                if 'reviews' not in str(link):
                    if album.lower() in str(link).lower():
                        # Raw string: '\w'/'\d' in a plain literal is an
                        # invalid escape (DeprecationWarning since 3.6).
                        urls = re.findall(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+/.*."', str(link))
                        return urls[0].replace('"', '')
|
||||||
|
|
||||||
|
def get_songs(url):
    """Print the track listing scraped from the album page at *url*.

    Visible page text is scanned starting after the 'Additional notes'
    marker and ending at the 'Complete lineup' section; lyric links and
    vinyl-format boilerplate are skipped. The remaining entries appear
    to come in (number, title, length) triples -- TODO confirm against
    the current page layout -- and are printed one triple per line with
    the length parenthesised.
    """

    def tag_visible(element):
        # Filter out text nodes that a browser would not render.
        if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
            return False
        if isinstance(element, Comment):
            return False
        return True

    def text_from_html(body):
        soup = BeautifulSoup(body, 'html.parser')
        texts = soup.findAll(text=True)
        # Manual append loop replaced by list(): same elements, same order.
        return list(filter(tag_visible, texts))

    html = urllib.request.urlopen(url).read()

    songs = []
    found = False
    for line in text_from_html(html):
        # The track list ends where the lineup section begins.
        if 'Complete lineup' in line:
            break
        # Skip lyric links and vinyl/compilation boilerplate.
        if '(loading lyrics...)' in line or 'Show lyrics' in line or \
                'Single-sided' in line or 'Double-sided' in line or \
                line == '\n' or line == ' ' or line == 'instrumental' or \
                'ompilation' in line or 'Side A' in line or 'Side B' in line:
            continue
        line = line.strip()
        if len(line) > 0:
            if found:
                songs.append(line)
            elif 'Additional notes' in line:
                # Track data starts after this marker.
                found = True
                continue

    # Delete the last item in the list (the total running time).
    # Guard added: the original raised IndexError when nothing matched.
    if songs:
        del songs[-1]

    count = 0
    while count < len(songs):
        # A ':' marks a track length (e.g. '4:32') -- parenthesise it.
        if ':' in songs[count]:
            print(' (', end='')
        print(songs[count], end='')
        count += 1
        if (count % 3) == 0:
            # A (number, title, length) triple is complete: end the row.
            print(')\n', end='')
        elif (count % 3) == 1:
            print(' ', end='')
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: expects one argument of the form 'band - album'."""
    parts = sys.argv[1].split(' - ')
    get_songs(find_album_url(parts[0], parts[1]))


if __name__ == "__main__":
    main()
|
||||||
29
text-scaper.py
Executable file
@@ -0,0 +1,29 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from bs4.element import Comment
|
||||||
|
import urllib.request
|
||||||
|
|
||||||
|
def tag_visible(element):
    """Return True when *element* is text a browser would render."""
    hidden_parents = ['style', 'script', 'head', 'title', 'meta', '[document]']
    if element.parent.name in hidden_parents:
        return False
    # HTML comments are text nodes too, but never displayed.
    return not isinstance(element, Comment)
|
||||||
|
|
||||||
|
def text_from_html(body):
    """Return the list of visible text fragments in HTML *body*.

    Fragments are returned in document order, unstripped.
    """
    soup = BeautifulSoup(body, 'html.parser')
    texts = soup.findAll(text=True)
    # Manual append loop replaced by list(): same elements, same order.
    return list(filter(tag_visible, texts))
|
||||||
|
|
||||||
|
# return " ".join(t.strip() for t in visible_texts)
|
||||||
|
|
||||||
|
# Demo: dump every substantial visible text line from a band page.
url = 'https://www.metal-archives.com/bands/burzum'
html = urllib.request.urlopen(url).read()

for fragment in text_from_html(html):
    # Threshold of 2 chars filters stray punctuation and whitespace.
    if len(fragment) > 2:
        print(fragment.strip())
|
||||||