-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathhtmllib-example-1.py
More file actions
38 lines (31 loc) · 1.14 KB
/
Copy pathhtmllib-example-1.py
File metadata and controls
38 lines (31 loc) · 1.14 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
from html.parser import HTMLParser
class AnchorParser(HTMLParser):
    """Collect the hyperlinks of an HTML document, grouped by anchor text.

    After ``feed()`` has been called, ``anchors`` maps every non-empty anchor
    text to the list of ``href`` values of the ``<a>`` elements carrying that
    text, in document order.  ``<a>`` elements whose ``href`` is missing or
    empty, and links without any visible text, are ignored.
    """

    def __init__(self):
        super().__init__()
        # Anchor text -> list of href values seen with that text.
        self.anchors = {}
        # href of the <a> element currently open, or None outside a link.
        self.current_href = None
        # Raw text fragments collected inside the currently open <a>.
        self.current_data = []

    def handle_starttag(self, tag, attrs):
        """Begin collecting text when an ``<a>`` element with an href opens."""
        if tag == 'a':
            # A missing, valueless or empty href is normalised to None so
            # that handle_data() and handle_endtag() use the same notion of
            # "inside a link"; otherwise href="" would start collecting text
            # that the end tag never flushes.
            self.current_href = dict(attrs).get('href') or None
            self.current_data = []

    def handle_data(self, data):
        """Remember a text fragment if it appears inside an open link."""
        if self.current_href is not None:
            self.current_data.append(data)

    def handle_endtag(self, tag):
        """Record the finished link and reset the per-link state on ``</a>``."""
        if tag == 'a' and self.current_href is not None:
            # Fragments are separated by a space, then every run of
            # whitespace (including line breaks and whitespace-only
            # fragments around inline tags) is collapsed to a single space.
            text = ' '.join(' '.join(self.current_data).split())
            if text:
                self.anchors.setdefault(text, []).append(self.current_href)
            self.current_href = None
            self.current_data = []
# Set up the collector first; it holds no state until it is fed markup.
parser = AnchorParser()

# Load the whole page into memory (the path is relative to the working
# directory) and hand it to the parser in a single call.
with open("contemplate_his_majestic_personhood.html", encoding='utf-8') as source:
    html = source.read()
parser.feed(html)

# Report each distinct anchor text together with every href it points to.
for text, hrefs in parser.anchors.items():
    print(f"{text} => {hrefs}")