Extracting Data from HTML with Regex

This example uses regular expressions to extract specific data (each link's URL and its anchor text) from HTML content.

import re

def extract_links_and_text(html):
    """Return the ``(url, text)`` pair of every ``<a href=...>`` link in *html*.

    Matching is case-insensitive and the link text may span several lines.
    The ``href`` value may be wrapped in double or single quotes, with
    optional whitespace around ``=``; unquoted values are not matched.

    Args:
        html: The HTML source to scan, as a string.

    Returns:
        A list of ``(url, text)`` tuples in document order. ``text`` is the
        raw content between the opening tag and the next ``</a>``, returned
        verbatim (nested markup and surrounding whitespace are kept).
        The list is empty when no link matches.

    Note:
        This is a regex scan, not an HTML parser: it does not decode
        entities or skip comments/scripts.
    """
    link_pattern = (
        r'<a\s+(?:[^>]*?\s+)?'            # "<a", optionally other attributes first
        r'href\s*=\s*'                    # tolerate whitespace around "="
        r'(?:"([^"]*)"|\'([^\']*)\')'     # double- or single-quoted URL
        r'[^>]*>'                         # remainder of the opening tag
        r'(.*?)</a>'                      # shortest text up to the closing tag
    )
    links = []
    for match in re.finditer(link_pattern, html, re.IGNORECASE | re.DOTALL):
        double_quoted, single_quoted, text = match.groups()
        # Exactly one of the two quote alternatives participates in a match;
        # compare against None so an empty href ("") is still reported.
        url = double_quoted if double_quoted is not None else single_quoted
        links.append((url, text))
    return links

# Sample HTML fragment used to demonstrate extract_links_and_text().
html_content = """
    <h1>Sample Page</h1>
    <p>Check out these links:</p>
        <a href="https://www.example.com">Example Site</a>
        <a href="/page">Internal Page</a>
        <a href="https://www.github.com">GitHub</a>
"""

# Extract every (url, text) pair from the sample and print each on two lines.
links = extract_links_and_text(html_content)
for url, text in links:
    print(f"URL: {url}")
    print(f"Text: {text}")