From cf207e7af18abf69ee9e4be21fb41efa32064cd3 Mon Sep 17 00:00:00 2001
From: Mitch Weaver <20451170+MitchWeaver@users.noreply.github.com>
Date: Mon, 23 Apr 2018 18:01:54 +0200
Subject: [PATCH] fix gitignore remove

---
 .gitignore     |  1 +
 text-scaper.py | 29 -----------------------------
 2 files changed, 1 insertion(+), 29 deletions(-)
 delete mode 100755 text-scaper.py

diff --git a/.gitignore b/.gitignore
index 5051b0e..cfcba2c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 *.pyc
 *.core
 text-scraping.py
+text-scraper.py
 notes
 notes.md
diff --git a/text-scaper.py b/text-scaper.py
deleted file mode 100755
index a6f140a..0000000
--- a/text-scaper.py
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/usr/bin/env python3
-from bs4 import BeautifulSoup
-from bs4.element import Comment
-import urllib.request
-
-def tag_visible(element):
-    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
-        return False
-    if isinstance(element, Comment):
-        return False
-    return True
-
-def text_from_html(body):
-    soup = BeautifulSoup(body, 'html.parser')
-    texts = soup.findAll(text=True)
-    visible_texts = filter(tag_visible, texts)
-    lines = []
-    for line in visible_texts:
-        lines.append(line)
-    return lines
-
-    # return " ".join(t.strip() for t in visible_texts)
-
-url = 'https://www.metal-archives.com/bands/burzum'
-html = urllib.request.urlopen(url).read()
-
-for line in text_from_html(html):
-    if len(line) > 2:
-        print(line.strip())