Extracting Text From HTML (Web Pages) in Python

 

from urllib.request import urlopen

# Scenario 1: HTML with a well-formed <title> tag.
# Extract the title text from the HTML using only string methods.

url = "http://olympus.realpython.org/profiles/aphrodite"
# Fetch the page exactly once; the context manager closes the HTTP response
# as soon as the body has been read.
with urlopen(url) as page:
    html_bytes = page.read()
html = html_bytes.decode("utf-8")
# One-line alternative to the fetch above. Use one form or the other, not
# both, otherwise the page is downloaded twice:
#     html = urlopen(url).read().decode("utf-8")

# Locate the opening tag, then skip past it so the slice starts at the text.
title_index = html.find("<title>")
start_index = title_index + len("<title>")
# The slice ends where the closing tag begins.
end_index = html.find("</title>")
title = html[start_index:end_index]
print(title)



# Scenario 2: HTML with an improper <title > tag (note the extra space).
# The same string-method approach as scenario 1 is applied here.
# NOTE(review): this looks like a deliberate demonstration of where plain
# string searching breaks down, so the slicing logic is left as-is.

url = "http://olympus.realpython.org/profiles/poseidon"
# The context manager closes the HTTP response once the body has been read.
with urlopen(url) as page:
    html = page.read().decode("utf-8")

# str.find() returns -1 when the substring is absent. If the page really uses
# "<title >" instead of "<title>", start_index becomes -1 + len("<title>")
# and the slice below begins near the top of the document rather than at the
# title text, so the printed value will contain stray markup.
start_index = html.find("<title>") + len("<title>")
end_index = html.find("</title>")
title = html[start_index:end_index]
print(title)



# Scenario 3: HTML with improper <title > tags.
# Extract text from HTML with regular expressions (regexes).

import re

# Example of the markup this scenario is written for:
# <TITLE >Profile: Dionysus</title  / >
url = "http://olympus.realpython.org/profiles/dionysus"
# The context manager closes the HTTP response once the body has been read.
with urlopen(url) as page:
    html = page.read().decode("utf-8")
print(html)

# "<title.*?>" and "</title.*?>" allow extra characters before the closing
# ">" of each tag; the non-greedy ".*?" in between captures the title text.
# re.IGNORECASE lets "TITLE", "Title", etc. match as well.
pattern = r"<title.*?>.*?</title.*?>"
match_result = re.search(pattern, html, re.IGNORECASE)
# re.search() returns None when nothing matches; fall back to an empty title
# instead of failing with AttributeError on None.group().
title = match_result.group() if match_result else ""
# Remove everything that looks like a tag, leaving only the text between them.
proper_title = re.sub(r"<.*?>", "", title)
print(proper_title)



Comments

Popular posts from this blog

Quotation marks to wrap an element in HTML

The Basic Structure of a Full-Stack Web App

Unlocking Web Design: A Guide to Mastering CSS Layout Modes