commit 39eb9205cf2e578d3e369613cd37b3114d7ed3bb
Author: Mitch Weaver <20451170+MitchWeaver@users.noreply.github.com>
Date: Mon Apr 23 17:56:32 2018 +0200
initial commit
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..5051b0e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+*.pyc
+*.core
+text-scraping.py
+notes
+notes.md
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..7d3decf
--- /dev/null
+++ b/README.md
@@ -0,0 +1,34 @@
+# metal-archives web scrapers
+
+Helpful scripts to get information from http://metal-archives.com
+
+### releases
+
+
+### cover
+
+
+### band
+
+
+### logo
+
+
+### lyrics
+
+
+### songs
+
+
+### genre
+
+
+### location
+
+
+
+Originally these were methods from http://github.com/mitchweaver/diskvlt-bot
+
+But as they are so handy, I now use them to categorize and tag my music.
+
+
diff --git a/band b/band
new file mode 100755
index 0000000..e15828c
--- /dev/null
+++ b/band
@@ -0,0 +1,35 @@
+#!/usr/bin/env python3
+#
+# http://github.com/mitchweaver/bin
+#
+# get a given band pic from metal-archives.com
+#
+# Usage: ./band 'judas iscariot'
+#
+
+from bs4 import BeautifulSoup
+import urllib
+import sys
+
def band_pic(band):
    """Download the band photo for *band* from metal-archives.com into ./band.jpg.

    Best-effort: any failure (network, parse, missing image) prints an error
    message instead of raising, matching the original contract.
    """
    # 'import urllib' alone does not guarantee the submodule is loaded;
    # import it explicitly so urllib.request.* is always available.
    import urllib.request

    try:
        url = 'https://www.metal-archives.com/bands/' + band.replace(' ', '_').lower()

        def get_soup(url, header):
            # MA rejects the default Python user agent, hence the custom header
            request = urllib.request.Request(url, headers=header)
            return BeautifulSoup(urllib.request.urlopen(request), "html5lib")

        header = {'User-Agent': 'Mozilla/5.0'}
        image_urls = [a['src'] for a in get_soup(url, header).find_all("img")]

        # index 2 appears to be the band photo on MA band pages -- TODO confirm
        image = urllib.request.urlopen(image_urls[2]).read()

        # 'with' guarantees the file is closed even if the write fails
        with open("./band.jpg", 'wb') as out:
            out.write(image)
    except Exception:
        # narrowed from bare 'except:' so SystemExit/KeyboardInterrupt propagate
        print("Error - can't find it?")
+
def main():
    """CLI entry point: the first argument is the band name."""
    band_pic(sys.argv[1])

if __name__ == "__main__":
    main()
diff --git a/cover b/cover
new file mode 100755
index 0000000..e568f70
--- /dev/null
+++ b/cover
@@ -0,0 +1,63 @@
+#!/usr/bin/env python3
+#
+# http://github.com/mitchweaver/bin
+#
+# get a given album cover from metal-archives.com
+#
+# Usage: ./cover 'hellhammer - death fiend'
+#
+
+from bs4 import BeautifulSoup
+
+import requests
+import sys
+import re
+import urllib
+
def find_album_url(band, album):
    """Return the metal-archives.com URL of *album* by *band*, or None.

    MA hides the discography behind a numeric band ID, so we first scrape the
    band page for the "Complete discography" link, then scan that page for a
    non-review link whose text contains the album name.
    """

    def get_soup(url):
        # fetch a page and parse it with html5lib
        return BeautifulSoup(requests.get(url).text, "html5lib")

    url = 'https://www.metal-archives.com/bands/' + band.lower()
    soup = get_soup(url)

    for link in soup.find_all('a'):
        strlink = str(link)
        # partial matches ('iscography'/'omplete') sidestep capitalization
        if 'iscography' in strlink and 'omplete' in strlink:
            # strip the link text, leaving the href garbage the regex handles
            disco_url = strlink.replace('Complete discography', '')
            for album_link in get_soup(disco_url).find_all('a'):
                text = str(album_link)
                # skip review links, then look for the album title
                if 'reviews' not in text and album.lower() in text.lower():
                    # raw string: avoids invalid-escape warnings for \w, \d
                    urls = re.findall(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+/.*."', text)
                    if urls:  # guard against IndexError on a non-matching link
                        return urls[0].replace('"', '')
    return None
+
def get_album_art(url):
    """Download the cover image of the album page at *url* into ./cover.jpg.

    Best-effort: any failure prints an error message instead of raising.
    """
    # 'import urllib' alone does not guarantee the submodule is loaded
    import urllib.request

    try:
        def get_soup(url, header):
            # MA rejects the default Python user agent, hence the custom header
            request = urllib.request.Request(url, headers=header)
            return BeautifulSoup(urllib.request.urlopen(request), "html5lib")

        header = {'User-Agent': 'Mozilla/5.0'}
        image_urls = [a['src'] for a in get_soup(url, header).find_all("img")]

        # index 1 appears to be the cover image on MA album pages -- TODO confirm
        image = urllib.request.urlopen(image_urls[1]).read()

        # 'with' guarantees the file is closed even if the write fails
        with open("./cover.jpg", 'wb') as out:
            out.write(image)
    except Exception:
        # narrowed from bare 'except:' so SystemExit/KeyboardInterrupt propagate
        print("Error - can't find it?")
+
def main():
    """CLI entry point: expects a single "band - album" argument."""
    parts = sys.argv[1].split(' - ')
    art_url = find_album_url(parts[0], parts[1])
    get_album_art(art_url)

if __name__ == "__main__":
    main()
diff --git a/genre b/genre
new file mode 100755
index 0000000..be563bd
--- /dev/null
+++ b/genre
@@ -0,0 +1,57 @@
+#!/usr/bin/env python3
+#
+# http://github.com/mitchweaver/bin
+#
+# get the genre of a given band from metal-archives
+#
+# Usage: ./genre 'darkthrone'
+#
+
+from bs4 import BeautifulSoup
+from bs4.element import Comment
+import urllib.request
+import sys
+
def tag_visible(element):
    """Return True when *element* is renderable page text, not markup metadata."""
    hidden_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')
    if isinstance(element, Comment):
        return False
    return element.parent.name not in hidden_parents
+
def text_from_html(body):
    """Return a list of the visible text fragments in an HTML document *body*."""
    soup = BeautifulSoup(body, 'html.parser')
    # findAll(text=True) yields every text node; keep only renderable ones
    return list(filter(tag_visible, soup.findAll(text=True)))
+
def parse(url):
    """Fetch a metal-archives band page and print its 'Genre:' label and value.

    Best-effort: any failure prints a message instead of raising.
    """
    try:
        html = urllib.request.urlopen(url).read()

        genre = []
        found_count = 0
        for line in text_from_html(html):
            # ignore whitespace-only fragments
            if len(line.strip()) < 2:
                continue
            if 'Genre:' in line:
                genre.append(line)
                found_count = 1
            elif found_count > 0:
                # the value is the next substantial fragment after the label
                genre.append(line)
                found_count += 1
                if found_count == 2:
                    break

        print(genre[0] + ' ' + genre[1])
    except Exception:
        # narrowed from bare 'except:' so SystemExit/KeyboardInterrupt propagate
        print("Can't find it?")
+
def main():
    """CLI entry point: the first argument is the band name."""
    base = 'https://www.metal-archives.com/bands/'
    parse(url=base + sys.argv[1])

if __name__ == "__main__":
    main()
diff --git a/location b/location
new file mode 100755
index 0000000..e8cbd83
--- /dev/null
+++ b/location
@@ -0,0 +1,56 @@
+#!/usr/bin/env python3
+#
+# http://github.com/mitchweaver/bin
+#
+# get the location of a given band from metal-archives
+#
+# Usage: ./location 'black sabbath'
+#
+
+from bs4 import BeautifulSoup
+from bs4.element import Comment
+import urllib.request
+import sys
+
def tag_visible(element):
    """Return True when *element* is renderable page text, not markup metadata."""
    if isinstance(element, Comment):
        return False
    return element.parent.name not in ('style', 'script', 'head', 'title', 'meta', '[document]')
+
def text_from_html(body):
    """Collect the visible text fragments of an HTML document into a list."""
    every_fragment = BeautifulSoup(body, 'html.parser').findAll(text=True)
    return [fragment for fragment in every_fragment if tag_visible(fragment)]
+
def parse(url):
    """Fetch a band page and print its 'Country of origin' and location lines.

    Best-effort: any failure prints a message instead of raising.
    """
    try:
        # normalize the band-name part of the URL the way MA expects
        html = urllib.request.urlopen(url.replace(' ', '_').lower()).read()

        location = []
        found_count = 0
        for line in text_from_html(html):
            # ignore whitespace-only fragments
            if len(line.strip()) < 2:
                continue
            if 'Country of origin:' in line:
                location.append(line)
                found_count = 1
            elif found_count > 0:
                # collect the three fragments following the label
                # (presumably country value, 'Location:' label, value -- verify)
                location.append(line)
                found_count += 1
                if found_count == 4:
                    break

        print(location[0] + ' ' + location[1])
        print(location[2] + ' ' + location[3])
    except Exception:
        # narrowed from bare 'except:' so SystemExit/KeyboardInterrupt propagate
        print("Can't find it?")
+
def main():
    """CLI entry point: the first argument is the band name."""
    base = 'https://www.metal-archives.com/bands/'
    parse(url=base + sys.argv[1])

if __name__ == "__main__":
    main()
diff --git a/logo b/logo
new file mode 100755
index 0000000..34828a1
--- /dev/null
+++ b/logo
@@ -0,0 +1,33 @@
+#!/usr/bin/env python3
+#
+# http://github.com/mitchweaver/bin
+#
+# get a given band logo from metal-archives.com
+#
+
+from bs4 import BeautifulSoup
+import urllib
+import sys
+
def band_logo(band):
    """Download the band logo for *band* from metal-archives.com into ./logo.jpg.

    Best-effort: any failure prints an error message instead of raising.
    """
    # 'import urllib' alone does not guarantee the submodule is loaded
    import urllib.request

    try:
        url = 'https://www.metal-archives.com/bands/' + band.replace(' ', '_').lower()

        def get_soup(url, header):
            # MA rejects the default Python user agent, hence the custom header
            request = urllib.request.Request(url, headers=header)
            return BeautifulSoup(urllib.request.urlopen(request), "html5lib")

        header = {'User-Agent': 'Mozilla/5.0'}
        image_urls = [a['src'] for a in get_soup(url, header).find_all("img")]

        # index 1 appears to be the logo image on MA band pages -- TODO confirm
        image = urllib.request.urlopen(image_urls[1]).read()

        # 'with' guarantees the file is closed even if the write fails
        with open("./logo.jpg", 'wb') as out:
            out.write(image)
    except Exception:
        # narrowed from bare 'except:' so SystemExit/KeyboardInterrupt propagate
        print("Error - can't find it?")
+
def main():
    """CLI entry point: the first argument is the band name."""
    band_logo(sys.argv[1])

if __name__ == "__main__":
    main()
diff --git a/releases b/releases
new file mode 100755
index 0000000..be3be27
--- /dev/null
+++ b/releases
@@ -0,0 +1,111 @@
+#!/usr/bin/env python3
+#
+# http://github.com/mitchweaver/bin
+#
+# get releases of a band from metal-archives
+#
+# Usage: ./releases 'iron maiden'
+#
+
+from bs4 import BeautifulSoup
+import requests
+import urllib
+import re
+import sys
+
class Line():
    """One discography entry: a release title, its year, and its type."""

    def __init__(self, title, year, type):
        # 'type' shadows the builtin but is kept for interface compatibility
        self.type = type
        self.title = title
        self.year = year

    def concat(self):
        """Return the entry formatted as 'title - year - (type)'."""
        line = self.title + " - " + self.year + " - (" \
            + self.type + ")"

        return line.strip()
+
def error():
    """Print a generic failure message and exit with a nonzero status."""
    print("Error: can't find it?")
    # sys.exit(1) instead of quit(): quit() is meant for interactive sessions
    # and exits 0, which hides the failure from shell callers
    sys.exit(1)
+
def get_releases(band):
    """Print every release of *band* as 'title - year - (type)' lines.

    Scrapes the MA band page for the embedded numeric band ID, fetches the
    discography table behind that ID, and reassembles its text columns.
    Exits via error() on any failure.
    """
    # 'import urllib' alone does not guarantee the submodule is loaded
    import urllib.request

    try:
        ID = ""
        url = ("https://www.metal-archives.com/bands/" + band.replace(" ", "_").lower())

        html = urllib.request.urlopen(url).read()
        soup = BeautifulSoup(html, "html5lib")

        # the band page embeds "bandId = NNNN;" in a script block
        match = re.search(r'\bbandId\b.*', soup.get_text())

        if match is not None:
            tmp = match.group(0).split(" = ")
            ID = tmp[1][:-1]  # drop the trailing ';'
        else:
            error()

        # the discography table lives behind the numeric band ID
        url = "https://www.metal-archives.com/band/discography/id/" \
            + ID + "/tab/all"

        html = urllib.request.urlopen(url).read()
        text = BeautifulSoup(html, "html5lib").get_text()

        raw_lines = (line.strip() for line in text.splitlines())

        # drop rating lines (they are the only ones containing '%')
        lines = [line for line in raw_lines if "%" not in line]

        # accumulators for the three columns of one table row
        title = ""
        year = ""
        rel_type = ""  # renamed from 'type' to avoid shadowing the builtin

        formatted_lines = []

        for line in lines:
            # skip whitespace / garble
            if len(line) < 2:
                continue

            # a year 1000-3999 anywhere in the line marks the year column
            match = re.match(r'.*[1-3][0-9]{3}', line)
            if match is not None:
                year = match.group(0)
            elif line in ("Full-length", "EP", "Demo", "Promo", "Compilation"):
                rel_type = line
            else:
                # anything else must be the title column
                title = line

            # one full row collected -> emit it and reset
            if title != "" and year != "" and rel_type != "":
                formatted_lines.append(Line(title, year, rel_type))
                title = year = rel_type = ""

        for entry in formatted_lines:
            print(entry.concat())
    except Exception:
        # was a bare 'except:', which also swallowed the SystemExit raised by
        # error() above and re-ran the handler; Exception lets exits propagate
        error()
+
def main():
    """CLI entry point: the first argument is the band name."""
    get_releases(sys.argv[1])

if __name__ == "__main__":
    main()
diff --git a/res/band.png b/res/band.png
new file mode 100644
index 0000000..7ccaa0d
Binary files /dev/null and b/res/band.png differ
diff --git a/res/cover.png b/res/cover.png
new file mode 100644
index 0000000..0746c99
Binary files /dev/null and b/res/cover.png differ
diff --git a/res/genre.png b/res/genre.png
new file mode 100644
index 0000000..aa4f8a7
Binary files /dev/null and b/res/genre.png differ
diff --git a/res/location.png b/res/location.png
new file mode 100644
index 0000000..8ca52ac
Binary files /dev/null and b/res/location.png differ
diff --git a/res/logo.png b/res/logo.png
new file mode 100644
index 0000000..7c8c1dd
Binary files /dev/null and b/res/logo.png differ
diff --git a/res/lyrics.png b/res/lyrics.png
new file mode 100644
index 0000000..5604101
Binary files /dev/null and b/res/lyrics.png differ
diff --git a/res/releases.png b/res/releases.png
new file mode 100644
index 0000000..ded284d
Binary files /dev/null and b/res/releases.png differ
diff --git a/res/songs.png b/res/songs.png
new file mode 100644
index 0000000..23c1e35
Binary files /dev/null and b/res/songs.png differ
diff --git a/songs b/songs
new file mode 100755
index 0000000..c774306
--- /dev/null
+++ b/songs
@@ -0,0 +1,100 @@
+#!/usr/bin/env python3
+#
+# http://github.com/mitchweaver/bin
+#
+# get songs from a given 'band - album' from metal-archives.com
+#
+# Usage: ./songs 'ulver - bergtatt'
+#
+
+import requests
+import sys
+import re
+import urllib
+from bs4 import BeautifulSoup
+from bs4.element import Comment
+import urllib.request
+
def find_album_url(band, album):
    """Return the metal-archives.com URL of *album* by *band*, or None.

    MA hides the discography behind a numeric band ID, so we first scrape the
    band page for the "Complete discography" link, then scan that page for a
    non-review link whose text contains the album name.
    """

    def get_soup(url):
        # fetch a page and parse it with html5lib
        return BeautifulSoup(requests.get(url).text, "html5lib")

    url = 'https://www.metal-archives.com/bands/' + band.lower()
    soup = get_soup(url)

    for link in soup.find_all('a'):
        strlink = str(link)
        # partial matches ('iscography'/'omplete') sidestep capitalization
        if 'iscography' in strlink and 'omplete' in strlink:
            # strip the link text, leaving the href garbage the regex handles
            disco_url = strlink.replace('Complete discography', '')
            for album_link in get_soup(disco_url).find_all('a'):
                text = str(album_link)
                # skip review links, then look for the album title
                if 'reviews' not in text and album.lower() in text.lower():
                    # raw string: avoids invalid-escape warnings for \w, \d
                    urls = re.findall(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+/.*."', text)
                    if urls:  # guard against IndexError on a non-matching link
                        return urls[0].replace('"', '')
    return None
+
def get_songs(url):
    """Print the track list of the album page at *url*.

    Scrapes the page's visible text, collects the fragments between the
    'Additional notes' marker and the 'Complete lineup' section, then prints
    them grouped three at a time (track number, title, length) with the
    length wrapped in parentheses.
    """
    def tag_visible(element):
        # filter out markup that never renders as page text
        if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
            return False
        if isinstance(element, Comment):
            return False
        return True

    def text_from_html(body):
        # collect every visible text fragment of the page, in document order
        soup = BeautifulSoup(body, 'html.parser')
        texts = soup.findAll(text=True)
        visible_texts = filter(tag_visible, texts)
        lines = []
        for line in visible_texts:
            lines.append(line)
        return lines

    html = urllib.request.urlopen(url).read()

    songs = []
    found = False
    for line in text_from_html(html):
        # the lineup section follows the track list; stop there
        if 'Complete lineup' in line:
            break
        # skip lyrics links, vinyl side markers, and other non-track noise
        if '(loading lyrics...)' in line or 'Show lyrics' in line or \
           'Single-sided' in line or 'Double-sided' in line or \
           line == '\n' or line == ' ' or line == 'instrumental' or \
           'ompilation' in line or 'Side A' in line or 'Side B' in line:
            continue
        line = line.strip()
        if len(line) > 0:
            if found:
                songs.append(line)
            elif 'Additional notes' in line:
                # NOTE(review): tracks appear to start after this marker -- confirm
                found = True
            continue

    # delete last item in list, (the total time)
    del songs[-1]

    # fragments come in groups of three; a ':' marks a track length, which is
    # wrapped in '(...)'; a newline is emitted after each completed group
    count = 0
    while count < len(songs):
        if ':' in songs[count]:
            print(' (', end='')
        print(songs[count], end='')
        count += 1
        if (count % 3) == 0:
            print(')\n', end='')
        elif (count % 3) == 1:
            print(' ', end='')
+
+
def main():
    """CLI entry point: expects a single "band - album" argument."""
    parts = sys.argv[1].split(' - ')
    album_url = find_album_url(parts[0], parts[1])
    get_songs(album_url)

if __name__ == "__main__":
    main()
diff --git a/text-scaper.py b/text-scaper.py
new file mode 100755
index 0000000..a6f140a
--- /dev/null
+++ b/text-scaper.py
@@ -0,0 +1,29 @@
+#!/usr/bin/env python3
+from bs4 import BeautifulSoup
+from bs4.element import Comment
+import urllib.request
+
def tag_visible(element):
    """Return True when *element* is renderable page text, not markup metadata."""
    hidden = ('style', 'script', 'head', 'title', 'meta', '[document]')
    if isinstance(element, Comment):
        return False
    return element.parent.name not in hidden
+
def text_from_html(body):
    """Return a list of the visible text fragments in an HTML document *body*."""
    soup = BeautifulSoup(body, 'html.parser')
    # findAll(text=True) yields every text node; keep only renderable ones
    return list(filter(tag_visible, soup.findAll(text=True)))
+
# demo: dump the visible text of a sample band page
page = urllib.request.urlopen('https://www.metal-archives.com/bands/burzum').read()

for fragment in text_from_html(page):
    # note: the length test is on the unstripped fragment, as before
    if len(fragment) > 2:
        print(fragment.strip())