initial commit
5
.gitignore
vendored
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
*.pyc
|
||||||
|
*.core
|
||||||
|
text-scraping.py
|
||||||
|
notes
|
||||||
|
notes.md
|
||||||
34
README.md
Normal file
@@ -0,0 +1,34 @@
|
|||||||
|
# metal-archives web scrapers
|
||||||
|
|
||||||
|
Helpful scripts to get information from https://www.metal-archives.com
|
||||||
|
|
||||||
|
### releases
|
||||||
|

|
||||||
|
|
||||||
|
### cover
|
||||||
|

|
||||||
|
|
||||||
|
### band
|
||||||
|

|
||||||
|
|
||||||
|
### logo
|
||||||
|

|
||||||
|
|
||||||
|
### lyrics
|
||||||
|

|
||||||
|
|
||||||
|
### songs
|
||||||
|

|
||||||
|
|
||||||
|
### genre
|
||||||
|

|
||||||
|
|
||||||
|
### location
|
||||||
|

|
||||||
|
|
||||||
|
|
||||||
|
Originally these were methods from http://github.com/mitchweaver/diskvlt-bot
|
||||||
|
|
||||||
|
But as they are so handy, I now use them to categorize and tag my music.
|
||||||
|
|
||||||
|
|
||||||
35
band
Executable file
@@ -0,0 +1,35 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
#
|
||||||
|
# http://github.com/mitchweaver/bin
|
||||||
|
#
|
||||||
|
# get a given band pic from metal-archives.com
|
||||||
|
#
|
||||||
|
# Usage: ./band 'judas iscariot'
|
||||||
|
#
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import urllib
|
||||||
|
import sys
|
||||||
|
|
||||||
|
def band_pic(band):
    """Download the band photo for *band* from metal-archives.com.

    The image is written to ./band.jpg in the current directory.
    On any failure (band not found, network error, unexpected page
    layout) an error message is printed instead of raising.
    """
    # Local import: the file-level ``import urllib`` alone does not
    # guarantee the ``urllib.request`` submodule is loaded.
    import urllib.request

    try:
        url = 'https://www.metal-archives.com/bands/' + band.replace(' ', '_').lower()

        def get_soup(url, header):
            # MA rejects the default Python user agent, hence the header.
            return BeautifulSoup(
                urllib.request.urlopen(urllib.request.Request(url, headers=header)),
                "html5lib")

        header = {'User-Agent': 'Mozilla/5.0'}
        image_urls = [a['src'] for a in get_soup(url, header).find_all("img")]

        # Index 2: the third <img> on a band page is the band photo
        # (empirical -- TODO confirm against the current page layout).
        image = urllib.request.urlopen(image_urls[2]).read()

        # ``with`` guarantees the file handle is closed even on error.
        with open("./band.jpg", 'wb') as out:
            out.write(image)
    except Exception:
        # Was a bare ``except:``; narrowed so KeyboardInterrupt/SystemExit
        # are no longer swallowed.
        print("Error - can't find it?")
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: fetch the photo for the band named in argv[1]."""
    band_pic(sys.argv[1])


if __name__ == "__main__":
    main()
|
||||||
63
cover
Executable file
@@ -0,0 +1,63 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
#
|
||||||
|
# http://github.com/mitchweaver/bin
|
||||||
|
#
|
||||||
|
# get a given album cover from metal-archives.com
|
||||||
|
#
|
||||||
|
# Usage: ./album 'hellhammer - death fiend'
|
||||||
|
#
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
import requests
|
||||||
|
import sys
|
||||||
|
import re
|
||||||
|
import urllib
|
||||||
|
|
||||||
|
def find_album_url(band, album):
    """Return the metal-archives URL of *album* by *band*, or None.

    Scrapes the band page for the "Complete discography" link (the
    discography URL embeds a numeric ID, so it cannot be built
    directly), then scans that page for a link whose text contains
    the album name.
    """

    def get_soup(url):
        data = requests.get(url).text
        return BeautifulSoup(data, "html5lib")

    url = 'https://www.metal-archives.com/bands/' + band.lower()
    soup = get_soup(url)

    for link in soup.find_all('a'):
        strlink = str(link)
        # Substring match sidesteps Discography/discography casing.
        if 'iscography' in strlink and 'omplete' in strlink:
            # Strip the surrounding anchor markup, keeping only the href.
            url = strlink.replace('<a href="', '').replace('"><span>Complete discography</span></a>', '')
            # Scrape the discography page for the album link.
            for link in get_soup(url).find_all('a'):
                # Skip review links, which also contain the album title.
                if 'reviews' not in str(link):
                    if album.lower() in str(link).lower():
                        # Raw string: '\w'/'\d' in a plain literal is an
                        # invalid escape (DeprecationWarning since 3.6).
                        urls = re.findall(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+/.*."', str(link))
                        return urls[0].replace('"', '')
|
||||||
|
|
||||||
|
def get_album_art(url):
    """Download the cover image from the album page at *url*.

    The image is written to ./cover.jpg in the current directory.
    On any failure an error message is printed instead of raising.
    """
    # Local import: file-level ``import urllib`` alone does not
    # guarantee the ``urllib.request`` submodule is loaded.
    import urllib.request

    try:
        def get_soup(url, header):
            # MA rejects the default Python user agent, hence the header.
            return BeautifulSoup(
                urllib.request.urlopen(urllib.request.Request(url, headers=header)),
                "html5lib")

        header = {'User-Agent': 'Mozilla/5.0'}
        image_urls = [a['src'] for a in get_soup(url, header).find_all("img")]

        # Index 1: the second <img> on an album page is the cover
        # (empirical -- TODO confirm against the current page layout).
        image = urllib.request.urlopen(image_urls[1]).read()

        # ``with`` guarantees the file handle is closed even on error.
        with open("./cover.jpg", 'wb') as out:
            out.write(image)
    except Exception:
        # Was a bare ``except:``; narrowed to Exception.
        print("Error - can't find it?")
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: expects one argument of the form 'band - album'."""
    parts = sys.argv[1].split(' - ')
    get_album_art(find_album_url(parts[0], parts[1]))


if __name__ == "__main__":
    main()
|
||||||
57
genre
Executable file
@@ -0,0 +1,57 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
#
|
||||||
|
# http://github.com/mitchweaver/bin
|
||||||
|
#
|
||||||
|
# get the genre of a given band from metal-archives
|
||||||
|
#
|
||||||
|
# Usage: ./genre 'darkthrone'
|
||||||
|
#
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from bs4.element import Comment
|
||||||
|
import urllib.request
|
||||||
|
import sys
|
||||||
|
|
||||||
|
def tag_visible(element):
    """Return True when *element* is text a browser would render."""
    hidden_parents = ['style', 'script', 'head', 'title', 'meta', '[document]']
    if element.parent.name in hidden_parents:
        return False
    # HTML comments are text nodes too, but never displayed.
    return not isinstance(element, Comment)
|
||||||
|
|
||||||
|
def text_from_html(body):
    """Return the list of visible text fragments in HTML *body*.

    Fragments are returned in document order, unstripped.
    """
    soup = BeautifulSoup(body, 'html.parser')
    texts = soup.findAll(text=True)
    # Manual append loop replaced by list(): same elements, same order.
    return list(filter(tag_visible, texts))
|
||||||
|
|
||||||
|
# return " ".join(t.strip() for t in visible_texts)
|
||||||
|
|
||||||
|
def parse(url):
    """Print the 'Genre:' label and its value from the band page at *url*.

    Prints a fallback message on any failure instead of raising.
    """
    try:
        html = urllib.request.urlopen(url).read()

        genre = []
        found_count = 0
        for line in text_from_html(html):
            # Skip whitespace-only / single-character fragments.
            if len(line.strip()) < 2:
                continue
            if 'Genre:' in line:
                genre.append(line)
                found_count = 1
            elif found_count > 0:
                # The first substantial line after the label is the value.
                genre.append(line)
                found_count += 1
                if found_count == 2:
                    break

        print(genre[0] + ' ' + genre[1])
    except Exception:
        # Was a bare ``except:``; narrowed to Exception.
        print("Can't find it?")
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: argv[1] is the band name as it appears in the URL."""
    parse(url='https://www.metal-archives.com/bands/' + sys.argv[1])


if __name__ == "__main__":
    main()
|
||||||
56
location
Executable file
@@ -0,0 +1,56 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
#
|
||||||
|
# http://github.com/mitchweaver/bin
|
||||||
|
#
|
||||||
|
# get the location of a given band from metal-archives
|
||||||
|
#
|
||||||
|
# Usage: ./location 'black sabbath'
|
||||||
|
#
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from bs4.element import Comment
|
||||||
|
import urllib.request
|
||||||
|
import sys
|
||||||
|
|
||||||
|
def tag_visible(element):
    """Return True when *element* is text a browser would render."""
    hidden_parents = ['style', 'script', 'head', 'title', 'meta', '[document]']
    if element.parent.name in hidden_parents:
        return False
    # HTML comments are text nodes too, but never displayed.
    return not isinstance(element, Comment)
|
||||||
|
|
||||||
|
def text_from_html(body):
    """Return the list of visible text fragments in HTML *body*.

    Fragments are returned in document order, unstripped.
    """
    soup = BeautifulSoup(body, 'html.parser')
    texts = soup.findAll(text=True)
    # Manual append loop replaced by list(): same elements, same order.
    return list(filter(tag_visible, texts))
|
||||||
|
|
||||||
|
def parse(url):
    """Print country and location info from the band page at *url*.

    The band-name portion of *url* may contain spaces; they are
    replaced with underscores and the URL lowercased before the
    request. Prints a fallback message on any failure.
    """
    try:
        html = urllib.request.urlopen(url.replace(' ', '_').lower()).read()

        location = []
        found_count = 0
        for line in text_from_html(html):
            # Skip whitespace-only / single-character fragments.
            if len(line.strip()) < 2:
                continue
            if 'Country of origin:' in line:
                location.append(line)
                found_count = 1
            elif found_count > 0:
                # Collect the three lines after the label: country value,
                # 'Location:' label, location value.
                location.append(line)
                found_count += 1
                if found_count == 4:
                    break

        print(location[0] + ' ' + location[1])
        print(location[2] + ' ' + location[3])
    except Exception:
        # Was a bare ``except:``; narrowed to Exception.
        print("Can't find it?")
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: argv[1] is the band name (spaces allowed)."""
    parse(url='https://www.metal-archives.com/bands/' + sys.argv[1])


if __name__ == "__main__":
    main()
|
||||||
33
logo
Executable file
@@ -0,0 +1,33 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
#
|
||||||
|
# http://github.com/mitchweaver/bin
|
||||||
|
#
|
||||||
|
# get a given band logo from metal-archives.com
|
||||||
|
#
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import urllib
|
||||||
|
import sys
|
||||||
|
|
||||||
|
def band_logo(band):
    """Download the logo for *band* from metal-archives.com.

    The image is written to ./logo.jpg in the current directory.
    On any failure an error message is printed instead of raising.
    """
    # Local import: file-level ``import urllib`` alone does not
    # guarantee the ``urllib.request`` submodule is loaded.
    import urllib.request

    try:
        url = 'https://www.metal-archives.com/bands/' + band.replace(' ', '_').lower()

        def get_soup(url, header):
            # MA rejects the default Python user agent, hence the header.
            return BeautifulSoup(
                urllib.request.urlopen(urllib.request.Request(url, headers=header)),
                "html5lib")

        header = {'User-Agent': 'Mozilla/5.0'}
        image_urls = [a['src'] for a in get_soup(url, header).find_all("img")]

        # Index 1: the second <img> on a band page is the logo
        # (empirical -- TODO confirm against the current page layout).
        image = urllib.request.urlopen(image_urls[1]).read()

        # ``with`` guarantees the file handle is closed even on error.
        with open("./logo.jpg", 'wb') as out:
            out.write(image)
    except Exception:
        # Was a bare ``except:``; narrowed to Exception.
        print("Error - can't find it?")
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: fetch the logo for the band named in argv[1]."""
    band_logo(sys.argv[1])


if __name__ == "__main__":
    main()
|
||||||
111
releases
Executable file
@@ -0,0 +1,111 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
#
|
||||||
|
# http://github.com/mitchweaver/bin
|
||||||
|
#
|
||||||
|
# get releases of a band from metal-archives
|
||||||
|
#
|
||||||
|
# Usage: ./releases 'iron maiden'
|
||||||
|
#
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import requests
|
||||||
|
import urllib
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
|
||||||
|
class Line():
    """One discography entry: release title, year, and release type."""

    def __init__(self, title, year, type):
        # ``type`` shadows the builtin, but the name is part of the
        # existing call signature, so it is kept for compatibility.
        # (Redundant class-level attribute defaults removed: they were
        # always shadowed by these instance attributes.)
        self.type = type
        self.title = title
        self.year = year

    def concat(self):
        """Return the entry formatted as 'title - year - (type)'."""
        line = self.title + " - " + self.year + " - (" + self.type + ")"
        return line.strip()
|
||||||
|
|
||||||
|
def error():
    """Print a generic failure message and terminate the script.

    Raises SystemExit (exit status 1).
    """
    print("Error: can't find it?")
    # sys.exit(1) instead of quit(): quit() is injected by the site
    # module (not guaranteed to exist), and a failure should report a
    # nonzero exit status.
    sys.exit(1)
|
||||||
|
|
||||||
|
def get_releases(band):
    """Print every release of *band* as 'title - year - (type)' lines.

    Scrapes the band page for the numeric band ID (only present in an
    inline script as 'bandId = NNNN;'), fetches the full discography
    table for that ID, and parses the table text into
    (title, year, type) triples. Prints an error and exits on failure.
    """
    try:
        ID = ""
        url = ("https://www.metal-archives.com/bands/" + band.replace(" ", "_").lower())

        html = urllib.request.urlopen(url).read()
        soup = BeautifulSoup(html, "html5lib")

        # Locate the 'bandId = NNNN;' assignment in the page text.
        match = re.search(r'\bbandId\b.*', soup.get_text())
        if match is not None:
            tmp = match.group(0).split(" = ")
            tmp = tmp[1][:-1]  # drop the trailing ';'
            ID = tmp
        else:
            error()

        url = "https://www.metal-archives.com/band/discography/id/" \
            + ID + "/tab/all"

        html = urllib.request.urlopen(url).read()
        soup = BeautifulSoup(html, "html5lib")
        text = soup.get_text()

        raw_lines = (line.strip() for line in text.splitlines())

        # Drop rating lines (they always contain a '%').
        lines = [line for line in raw_lines if "%" not in line]

        # Accumulators for the current (title, year, type) triple.
        title = ""
        year = ""
        type = ""

        formatted_lines = []

        for line in lines:
            # Skip whitespace / garble.
            if len(line) < 2:
                continue

            # A 4-digit number starting 1-3 marks the release year.
            match = re.match(r'.*[1-3][0-9]{3}', line)
            if match is not None:
                year = match.group(0)

            # Known release types (Single / Boxed set deliberately
            # excluded, as in the original).
            elif line in ("Full-length", "EP", "Demo", "Promo", "Compilation"):
                type = line

            # Otherwise this must be a release title.
            else:
                title = line

            # Once a full triple is collected, record it and reset.
            if title != "" and year != "" and type != "":
                formatted_lines.append(Line(title, year, type))
                title = year = type = ""

        for line in formatted_lines:
            print(line.concat())
    except Exception:
        # Was a bare ``except:``; narrowed to Exception.
        error()
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: print the discography of the band in argv[1]."""
    get_releases(sys.argv[1])


if __name__ == "__main__":
    main()
|
||||||
BIN
res/band.png
Normal file
|
After Width: | Height: | Size: 128 KiB |
BIN
res/cover.png
Normal file
|
After Width: | Height: | Size: 125 KiB |
BIN
res/genre.png
Normal file
|
After Width: | Height: | Size: 6.4 KiB |
BIN
res/location.png
Normal file
|
After Width: | Height: | Size: 8.0 KiB |
BIN
res/logo.png
Normal file
|
After Width: | Height: | Size: 39 KiB |
BIN
res/lyrics.png
Normal file
|
After Width: | Height: | Size: 29 KiB |
BIN
res/releases.png
Normal file
|
After Width: | Height: | Size: 12 KiB |
BIN
res/songs.png
Normal file
|
After Width: | Height: | Size: 20 KiB |
100
songs
Executable file
@@ -0,0 +1,100 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
#
|
||||||
|
# http://github.com/mitchweaver/bin
|
||||||
|
#
|
||||||
|
# get songs from a given 'band - album' from metal-archives.com
|
||||||
|
#
|
||||||
|
# Usage: ./songs 'ulver - bergtatt'
|
||||||
|
#
|
||||||
|
|
||||||
|
import requests
|
||||||
|
import sys
|
||||||
|
import re
|
||||||
|
import urllib
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from bs4.element import Comment
|
||||||
|
import urllib.request
|
||||||
|
|
||||||
|
def find_album_url(band, album):
    """Return the metal-archives URL of *album* by *band*, or None.

    Scrapes the band page for the "Complete discography" link (the
    discography URL embeds a numeric ID, so it cannot be built
    directly), then scans that page for a link whose text contains
    the album name.
    """

    def get_soup(url):
        data = requests.get(url).text
        return BeautifulSoup(data, "html5lib")

    url = 'https://www.metal-archives.com/bands/' + band.lower()
    soup = get_soup(url)

    for link in soup.find_all('a'):
        strlink = str(link)
        # Substring match sidesteps Discography/discography casing.
        if 'iscography' in strlink and 'omplete' in strlink:
            # Strip the surrounding anchor markup, keeping only the href.
            url = strlink.replace('<a href="', '').replace('"><span>Complete discography</span></a>', '')
            # Scrape the discography page for the album link.
            for link in get_soup(url).find_all('a'):
                # Skip review links, which also contain the album title.
                if 'reviews' not in str(link):
                    if album.lower() in str(link).lower():
                        # Raw string: '\w'/'\d' in a plain literal is an
                        # invalid escape (DeprecationWarning since 3.6).
                        urls = re.findall(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+/.*."', str(link))
                        return urls[0].replace('"', '')
|
||||||
|
|
||||||
|
def get_songs(url):
    """Print the track listing scraped from the album page at *url*.

    Visible page text is scanned starting after the 'Additional notes'
    marker and ending at the 'Complete lineup' section; lyric links and
    vinyl-format boilerplate are skipped. The remaining entries appear
    to come in (number, title, length) triples -- TODO confirm against
    the current page layout -- and are printed one triple per line with
    the length parenthesised.
    """

    def tag_visible(element):
        # Filter out text nodes that a browser would not render.
        if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
            return False
        if isinstance(element, Comment):
            return False
        return True

    def text_from_html(body):
        soup = BeautifulSoup(body, 'html.parser')
        texts = soup.findAll(text=True)
        # Manual append loop replaced by list(): same elements, same order.
        return list(filter(tag_visible, texts))

    html = urllib.request.urlopen(url).read()

    songs = []
    found = False
    for line in text_from_html(html):
        # The track list ends where the lineup section begins.
        if 'Complete lineup' in line:
            break
        # Skip lyric links and vinyl/compilation boilerplate.
        if '(loading lyrics...)' in line or 'Show lyrics' in line or \
                'Single-sided' in line or 'Double-sided' in line or \
                line == '\n' or line == ' ' or line == 'instrumental' or \
                'ompilation' in line or 'Side A' in line or 'Side B' in line:
            continue
        line = line.strip()
        if len(line) > 0:
            if found:
                songs.append(line)
            elif 'Additional notes' in line:
                # Track data starts after this marker.
                found = True
                continue

    # Delete the last item in the list (the total running time).
    # Guard added: the original raised IndexError when nothing matched.
    if songs:
        del songs[-1]

    count = 0
    while count < len(songs):
        # A ':' marks a track length (e.g. '4:32') -- parenthesise it.
        if ':' in songs[count]:
            print(' (', end='')
        print(songs[count], end='')
        count += 1
        if (count % 3) == 0:
            # A (number, title, length) triple is complete: end the row.
            print(')\n', end='')
        elif (count % 3) == 1:
            print(' ', end='')
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: expects one argument of the form 'band - album'."""
    parts = sys.argv[1].split(' - ')
    get_songs(find_album_url(parts[0], parts[1]))


if __name__ == "__main__":
    main()
|
||||||
29
text-scaper.py
Executable file
@@ -0,0 +1,29 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from bs4.element import Comment
|
||||||
|
import urllib.request
|
||||||
|
|
||||||
|
def tag_visible(element):
    """Return True when *element* is text a browser would render."""
    hidden_parents = ['style', 'script', 'head', 'title', 'meta', '[document]']
    if element.parent.name in hidden_parents:
        return False
    # HTML comments are text nodes too, but never displayed.
    return not isinstance(element, Comment)
|
||||||
|
|
||||||
|
def text_from_html(body):
    """Return the list of visible text fragments in HTML *body*.

    Fragments are returned in document order, unstripped.
    """
    soup = BeautifulSoup(body, 'html.parser')
    texts = soup.findAll(text=True)
    # Manual append loop replaced by list(): same elements, same order.
    return list(filter(tag_visible, texts))
|
||||||
|
|
||||||
|
# return " ".join(t.strip() for t in visible_texts)
|
||||||
|
|
||||||
|
# Demo: dump every substantial visible text line from a band page.
url = 'https://www.metal-archives.com/bands/burzum'
html = urllib.request.urlopen(url).read()

for fragment in text_from_html(html):
    # Threshold of 2 chars filters stray punctuation and whitespace.
    if len(fragment) > 2:
        print(fragment.strip())
|
||||||