Extracting Text From HTML (Web Pages) in Python
from urllib.request import urlopen
# Extract Title Text From HTML With String Methods
def extract_title_with_find(html):
    """Return the text between the literal ``<title>`` and ``</title>`` tags.

    The search uses plain ``str.find``, so only the exact, lowercase tags
    with no extra whitespace or attributes are recognised.

    Args:
        html: The decoded HTML document as a string.

    Returns:
        The text between the two tags, or ``None`` when either tag cannot
        be found.
    """
    opening_tag = "<title>"
    closing_tag = "</title>"

    title_index = html.find(opening_tag)
    # str.find signals "not found" with -1; without this check the slice
    # below would start at index 6 and return unrelated markup.
    if title_index == -1:
        return None

    start_index = title_index + len(opening_tag)
    # Look for the closing tag only after the opening one so that an
    # earlier stray "</title>" cannot produce an empty or reversed slice.
    end_index = html.find(closing_tag, start_index)
    if end_index == -1:
        return None

    return html[start_index:end_index]


if __name__ == "__main__":
    # scenario 1: html with proper <title> tags
    url = "http://olympus.realpython.org/profiles/aphrodite"
    # The context manager closes the HTTP response once it has been read.
    with urlopen(url) as page:
        html_bytes = page.read()
    html = html_bytes.decode("utf-8")
    # The fetch/read/decode steps can also be chained into one expression,
    # urlopen(url).read().decode("utf-8"); running both forms would only
    # download the same page a second time.
    title = extract_title_with_find(html)
    print(title)

    # scenario 2: html with improper <title > tags
    url = "http://olympus.realpython.org/profiles/poseidon"
    with urlopen(url) as page:
        html = page.read().decode("utf-8")
    # Exact string matching cannot see a tag written as "<title >", so the
    # helper returns None here instead of a meaningless slice of the page.
    title = extract_title_with_find(html)
    print(title)
# scenario 3: html with improper <title > tags
# Extract Text From HTML With Regular Expressions (REGEXES)
import re


def extract_title_with_regex(html):
    """Return the text of the first ``<title>`` element found in *html*.

    The match is case-insensitive and tolerates extra characters inside
    the opening and closing tags, e.g.
    ``<TITLE >Profile: Dionysus</title / >``.

    Args:
        html: The decoded HTML document as a string.

    Returns:
        The matched element with every ``<...>`` tag removed, or ``None``
        when the document contains no title element.
    """
    # DOTALL lets "." cross line breaks, so a title element that spans
    # several lines is still matched.
    pattern = r"<title.*?>.*?</title.*?>"
    match_result = re.search(pattern, html, re.IGNORECASE | re.DOTALL)
    # re.search returns None when nothing matches; calling .group() on it
    # would raise AttributeError.
    if match_result is None:
        return None
    title = match_result.group()
    # Strip the surrounding tags, leaving only the text between them.
    return re.sub(r"<.*?>", "", title, flags=re.DOTALL)


if __name__ == "__main__":
    # <TITLE >Profile: Dionysus</title / >
    url = "http://olympus.realpython.org/profiles/dionysus"
    # The context manager closes the HTTP response once it has been read.
    with urlopen(url) as page:
        html = page.read().decode("utf-8")
    print(html)
    proper_title = extract_title_with_regex(html)
    print(proper_title)
Comments
Post a Comment
Write something to CodeWithAbdur!