initial commit

Mitch Weaver
2018-04-23 17:56:32 +02:00
commit 39eb9205cf
18 changed files with 523 additions and 0 deletions

5
.gitignore vendored Normal file

@@ -0,0 +1,5 @@
*.pyc
*.core
text-scraping.py
notes
notes.md

34
README.md Normal file

@@ -0,0 +1,34 @@
# metal-archives web scrapers
Helpful scripts to get information from http://metal-archives.com
### releases
![releases.png](res/releases.png)
### cover
![cover.png](res/cover.png)
### band
![band.png](res/band.png)
### logo
![logo.png](res/logo.png)
### lyrics
![lyrics.png](res/lyrics.png)
### songs
![songs.png](res/songs.png)
### genre
![genre.png](res/genre.png)
### location
![location.png](res/location.png)
These scripts started out as methods in http://github.com/mitchweaver/diskvlt-bot, but they proved handy enough on their own that I now use them to categorize and tag my music.
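
Each script takes a single quoted argument, either a band name or a `'band - album'` pair. The invocations below mirror the usage lines in the script headers (logo takes the same argument as band); the band and album names are only examples.

```sh
./releases 'iron maiden'               # print a band's releases as 'title - year - (type)'
./cover    'hellhammer - death fiend'  # save the album cover to ./cover.jpg
./band     'judas iscariot'            # save the band photo to ./band.jpg
./logo     'darkthrone'                # save the band logo to ./logo.jpg
./genre    'darkthrone'                # print a band's genre
./location 'black sabbath'             # print a band's country and location
./songs    'ulver - bergtatt'          # print an album's track list
```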

35
band Executable file

@@ -0,0 +1,35 @@
#!/usr/bin/env python3
#
# http://github.com/mitchweaver/bin
#
# get a given band pic from metal-archives.com
#
# Usage: ./band 'judas iscariot'
#
from bs4 import BeautifulSoup
import urllib.request
import sys

def band_pic(band):
    try:
        url = 'https://www.metal-archives.com/bands/' + band.replace(' ', '_').lower()

        def get_soup(url, header):
            req = urllib.request.Request(url, headers=header)
            return BeautifulSoup(urllib.request.urlopen(req), "html5lib")

        # use a browser user agent so the request isn't rejected
        header = {'User-Agent': 'Mozilla/5.0'}
        image_urls = [a['src'] for a in get_soup(url, header).find_all("img")]
        # the band photo is the third <img> on the band page
        image = urllib.request.urlopen(image_urls[2]).read()
        with open("./band.jpg", 'wb') as f:
            f.write(image)
    except Exception:
        print("Error - can't find it?")

def main():
    band_pic(sys.argv[1])

if __name__ == "__main__":
    main()

63
cover Executable file

@@ -0,0 +1,63 @@
#!/usr/bin/env python3
#
# http://github.com/mitchweaver/bin
#
# get a given album cover from metal-archives.com
#
# Usage: ./cover 'hellhammer - death fiend'
#
from bs4 import BeautifulSoup
import requests
import sys
import re
import urllib.request

def find_album_url(band, album):
    def get_soup(url):
        data = requests.get(url).text
        return BeautifulSoup(data, "html5lib")

    url = 'https://www.metal-archives.com/bands/' + band.lower()
    soup = get_soup(url)
    for link in soup.find_all('a'):
        strlink = str(link)
        # scrape the band page for the discography link --
        # we need to do this because MA hides it behind an ID#
        if 'iscography' in strlink and 'omplete' in strlink:
            # strip the href markup down to the bare URL
            url = strlink.replace('<a href="', '').replace('"><span>Complete discography</span></a>', '')
            # scrape the discography page
            for link in get_soup(url).find_all('a'):
                # skip review links
                if 'reviews' not in str(link):
                    # look for the album
                    if album.lower() in str(link).lower():
                        urls = re.findall(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+/.*."', str(link))
                        return urls[0].replace('"', '')

def get_album_art(url):
    try:
        def get_soup(url, header):
            req = urllib.request.Request(url, headers=header)
            return BeautifulSoup(urllib.request.urlopen(req), "html5lib")

        header = {'User-Agent': 'Mozilla/5.0'}
        image_urls = [a['src'] for a in get_soup(url, header).find_all("img")]
        # the cover art is the second <img> on the album page
        image = urllib.request.urlopen(image_urls[1]).read()
        with open("./cover.jpg", 'wb') as f:
            f.write(image)
    except Exception:
        print("Error - can't find it?")

def main():
    arg = sys.argv[1].split(' - ')
    url = find_album_url(arg[0], arg[1])
    get_album_art(url)

if __name__ == "__main__":
    main()

57
genre Executable file

@@ -0,0 +1,57 @@
#!/usr/bin/env python3
#
# http://github.com/mitchweaver/bin
#
# get the genre of a given band from metal-archives
#
# Usage: ./genre 'darkthrone'
#
from bs4 import BeautifulSoup
from bs4.element import Comment
import urllib.request
import sys

def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True

def text_from_html(body):
    soup = BeautifulSoup(body, 'html.parser')
    texts = soup.findAll(text=True)
    visible_texts = filter(tag_visible, texts)
    lines = []
    for line in visible_texts:
        lines.append(line)
    return lines

def parse(url):
    try:
        html = urllib.request.urlopen(url).read()
        genre = []
        found_count = 0
        for line in text_from_html(html):
            if len(line.strip()) < 2:
                continue
            # grab the 'Genre:' label and the line that follows it
            if 'Genre:' in line:
                genre.append(line)
                found_count = 1
            elif found_count > 0:
                genre.append(line)
                found_count += 1
            if found_count == 2:
                break
        print(genre[0] + ' ' + genre[1])
    except Exception:
        print("Can't find it?")

def main():
    parse(url='https://www.metal-archives.com/bands/' + sys.argv[1])

if __name__ == "__main__":
    main()

56
location Executable file

@@ -0,0 +1,56 @@
#!/usr/bin/env python3
#
# http://github.com/mitchweaver/bin
#
# get the location of a given band from metal-archives
#
# Usage: ./location 'black sabbath'
#
from bs4 import BeautifulSoup
from bs4.element import Comment
import urllib.request
import sys

def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True

def text_from_html(body):
    soup = BeautifulSoup(body, 'html.parser')
    texts = soup.findAll(text=True)
    visible_texts = filter(tag_visible, texts)
    lines = []
    for line in visible_texts:
        lines.append(line)
    return lines

def parse(url):
    try:
        html = urllib.request.urlopen(url.replace(' ', '_').lower()).read()
        location = []
        found_count = 0
        for line in text_from_html(html):
            if len(line.strip()) < 2:
                continue
            # grab the two label/value pairs starting at 'Country of origin:'
            if 'Country of origin:' in line:
                location.append(line)
                found_count = 1
            elif found_count > 0:
                location.append(line)
                found_count += 1
            if found_count == 4:
                break
        print(location[0] + ' ' + location[1])
        print(location[2] + ' ' + location[3])
    except Exception:
        print("Can't find it?")

def main():
    parse(url='https://www.metal-archives.com/bands/' + sys.argv[1])

if __name__ == "__main__":
    main()

33
logo Executable file

@@ -0,0 +1,33 @@
#!/usr/bin/env python3
#
# http://github.com/mitchweaver/bin
#
# get a given band logo from metal-archives.com
#
# Usage: ./logo 'judas iscariot'
#
from bs4 import BeautifulSoup
import urllib.request
import sys

def band_logo(band):
    try:
        url = 'https://www.metal-archives.com/bands/' + band.replace(' ', '_').lower()

        def get_soup(url, header):
            req = urllib.request.Request(url, headers=header)
            return BeautifulSoup(urllib.request.urlopen(req), "html5lib")

        header = {'User-Agent': 'Mozilla/5.0'}
        image_urls = [a['src'] for a in get_soup(url, header).find_all("img")]
        # the band logo is the second <img> on the band page
        image = urllib.request.urlopen(image_urls[1]).read()
        with open("./logo.jpg", 'wb') as f:
            f.write(image)
    except Exception:
        print("Error - can't find it?")

def main():
    band_logo(sys.argv[1])

if __name__ == "__main__":
    main()

111
releases Executable file

@@ -0,0 +1,111 @@
#!/usr/bin/env python3
#
# http://github.com/mitchweaver/bin
#
# get releases of a band from metal-archives
#
# Usage: ./releases 'iron maiden'
#
from bs4 import BeautifulSoup
import urllib.request
import re
import sys

class Line():
    def __init__(self, title, year, type):
        self.type = type
        self.title = title
        self.year = year

    def concat(self):
        line = self.title + " - " + self.year + " - (" \
               + self.type + ")"
        return line.strip()

def error():
    print("Error: can't find it?")
    sys.exit(1)

def get_releases(band):
    try:
        ID = ""
        url = ("https://www.metal-archives.com/bands/" + band.replace(" ", "_").lower())
        html = urllib.request.urlopen(url).read()
        soup = BeautifulSoup(html, "html5lib")

        # the discography is loaded by ID, so dig the bandId out of the page text
        match = re.search(r'\bbandId\b.*', soup.get_text())
        if match is not None:
            tmp = match.group(0).split(" = ")
            ID = tmp[1][:-1]
        else:
            error()

        url = "https://www.metal-archives.com/band/discography/id/" \
              + ID + "/tab/all"
        html = urllib.request.urlopen(url).read()
        soup = BeautifulSoup(html, "html5lib")
        text = soup.get_text()
        raw_lines = (line.strip() for line in text.splitlines())

        # get rid of ratings in lines
        lines = []
        for line in raw_lines:
            if "%" not in line:
                lines.append(line)

        # temps
        title = ""
        year = ""
        type = ""
        formatted_lines = []
        for line in lines:
            # skip whitespace / garble
            if len(line) < 2:
                continue
            # check if year
            match = re.match(r'.*[1-3][0-9]{3}', line)
            if match is not None:
                year = match.group(0)
            # get type
            elif line == "Full-length": type = line
            elif line == "EP": type = line
            elif line == "Demo": type = line
            elif line == "Promo": type = line
            elif line == "Compilation": type = line
            # elif line == "Single": type = line
            # elif line == "Boxed set": type = line
            # else this must be our title
            else: title = line

            if title != "" and year != "" and type != "":
                formatted_lines.append(Line(title, year, type))
                title = year = type = ""

        for line in formatted_lines:
            print(line.concat())
    except Exception:
        error()

def main():
    get_releases(sys.argv[1])

if __name__ == "__main__":
    main()

BIN  res/band.png      Normal file  (binary file not shown; 128 KiB)
BIN  res/cover.png     Normal file  (binary file not shown; 125 KiB)
BIN  res/genre.png     Normal file  (binary file not shown; 6.4 KiB)
BIN  res/location.png  Normal file  (binary file not shown; 8.0 KiB)
BIN  res/logo.png      Normal file  (binary file not shown; 39 KiB)
BIN  res/lyrics.png    Normal file  (binary file not shown; 29 KiB)
BIN  res/releases.png  Normal file  (binary file not shown; 12 KiB)
BIN  res/songs.png     Normal file  (binary file not shown; 20 KiB)

100
songs Executable file

@@ -0,0 +1,100 @@
#!/usr/bin/env python3
#
# http://github.com/mitchweaver/bin
#
# get songs from a given 'band - album' from metal-archives.com
#
# Usage: ./songs 'ulver - bergtatt'
#
import requests
import sys
import re
import urllib.request
from bs4 import BeautifulSoup
from bs4.element import Comment

def find_album_url(band, album):
    def get_soup(url):
        data = requests.get(url).text
        return BeautifulSoup(data, "html5lib")

    url = 'https://www.metal-archives.com/bands/' + band.lower()
    soup = get_soup(url)
    for link in soup.find_all('a'):
        strlink = str(link)
        # scrape the band page for the discography link --
        # we need to do this because MA hides it behind an ID#
        if 'iscography' in strlink and 'omplete' in strlink:
            # strip the href markup down to the bare URL
            url = strlink.replace('<a href="', '').replace('"><span>Complete discography</span></a>', '')
            # scrape the discography page
            for link in get_soup(url).find_all('a'):
                # skip review links
                if 'reviews' not in str(link):
                    # look for the album
                    if album.lower() in str(link).lower():
                        urls = re.findall(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+/.*."', str(link))
                        return urls[0].replace('"', '')

def get_songs(url):
    def tag_visible(element):
        if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
            return False
        if isinstance(element, Comment):
            return False
        return True

    def text_from_html(body):
        soup = BeautifulSoup(body, 'html.parser')
        texts = soup.findAll(text=True)
        visible_texts = filter(tag_visible, texts)
        lines = []
        for line in visible_texts:
            lines.append(line)
        return lines

    html = urllib.request.urlopen(url).read()
    songs = []
    found = False
    for line in text_from_html(html):
        if 'Complete lineup' in line:
            break
        # skip page furniture around the track list
        if '(loading lyrics...)' in line or 'Show lyrics' in line or \
           'Single-sided' in line or 'Double-sided' in line or \
           line == '\n' or line == ' ' or line == 'instrumental' or \
           'ompilation' in line or 'Side A' in line or 'Side B' in line:
            continue
        line = line.strip()
        if len(line) > 0:
            if found:
                songs.append(line)
            elif 'Additional notes' in line:
                found = True
                continue

    # delete last item in list, (the total time)
    del songs[-1]

    # the scraped track list arrives as repeating triples
    # (number, title, duration); print each as 'num title (length)'
    count = 0
    while count < len(songs):
        if ':' in songs[count]:
            print(' (', end='')
        print(songs[count], end='')
        count += 1
        if (count % 3) == 0:
            print(')\n', end='')
        elif (count % 3) == 1:
            print(' ', end='')

def main():
    arg = sys.argv[1].split(' - ')
    url = find_album_url(arg[0], arg[1])
    get_songs(url)

if __name__ == "__main__":
    main()

29
text-scaper.py Executable file

@@ -0,0 +1,29 @@
#!/usr/bin/env python3
from bs4 import BeautifulSoup
from bs4.element import Comment
import urllib.request

def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True

def text_from_html(body):
    soup = BeautifulSoup(body, 'html.parser')
    texts = soup.findAll(text=True)
    visible_texts = filter(tag_visible, texts)
    lines = []
    for line in visible_texts:
        lines.append(line)
    return lines

url = 'https://www.metal-archives.com/bands/burzum'
html = urllib.request.urlopen(url).read()
for line in text_from_html(html):
    if len(line) > 2:
        print(line.strip())