Python Regular Expressions — Complete Guide with Examples

March 2026 · 18 min read · Python, Regex, Text Processing

Regular expressions are the Swiss army knife of text processing. They're ugly, powerful, and once you learn them, you'll use them everywhere — log parsing, data validation, web scraping, text cleanup. This guide covers Python's re module from basics to advanced patterns, with real examples you can actually use.

The Basics: re Module Functions

import re

text = "Order #12345 was placed on 2026-03-15 for $299.99"

# re.search scans the whole string and returns the first hit (or None).
m = re.search(r"#(\d+)", text)
if m:
    print(m.group())   # "#12345" — the full match
    print(m.group(1))  # "12345"  — just the captured digits

# re.match only succeeds when the pattern matches at position 0.
m = re.match(r"Order", text)       # ✅ matches
m = re.match(r"placed", text)      # ❌ None (not at start)

# re.findall returns every non-overlapping match as a list of strings.
numbers = re.findall(r"\d+", text)
print(numbers)  # ['12345', '2026', '03', '15', '299', '99']

# When the pattern has capture groups, findall returns the groups instead.
dates = re.findall(r"(\d{4})-(\d{2})-(\d{2})", text)
print(dates)  # [('2026', '03', '15')]

# re.sub replaces every match with the given string.
cleaned = re.sub(r"\$[\d.]+", "[PRICE]", text)
print(cleaned)  # "Order #12345 was placed on 2026-03-15 for [PRICE]"

# re.split cuts the string wherever the pattern matches.
parts = re.split(r"\s+", "hello   world   foo")
print(parts)  # ['hello', 'world', 'foo']
💡 Always use raw strings: Write r"\d+", not "\\d+". The r prefix prevents Python from interpreting backslashes before regex sees them.

Pattern Syntax Cheat Sheet

| Pattern | Matches | Example |
|---|---|---|
| `.` | Any character (except newline) | `a.c` → "abc", "a1c" |
| `\d` | Digit `[0-9]` | `\d{3}` → "123" |
| `\w` | Word char `[a-zA-Z0-9_]` | `\w+` → "hello_42" |
| `\s` | Whitespace `[ \t\n\r]` | `\s+` → " \t" |
| `\D`, `\W`, `\S` | Negated versions | `\D+` → "abc" |
| `^` | Start of string | `^Hello` |
| `$` | End of string | `world$` |
| `*` | 0 or more | `ab*c` → "ac", "abbc" |
| `+` | 1 or more | `ab+c` → "abc", "abbc" |
| `?` | 0 or 1 | `colou?r` → "color", "colour" |
| `{n,m}` | Between n and m | `\d{2,4}` → "12", "1234" |
| `[abc]` | Character class | `[aeiou]` → vowels |
| `[^abc]` | Negated class | `[^0-9]` → non-digits |
| `a\|b` | Alternation (or) | `cat\|dog` |
| `(...)` | Capture group | `(\d{4})-(\d{2})` |
| `(?:...)` | Non-capturing group | `(?:cat\|dog)s` |

Groups and Named Captures

import re

# --- Basic groups ---
log = '2026-03-15 14:30:22 ERROR [auth] Login failed for user@example.com'

# Five capture groups: date, time, level, bracketed module, free-text message.
m = re.search(
    r"(\d{4}-\d{2}-\d{2}) (\d{2}:\d{2}:\d{2}) (\w+) \[(\w+)\] (.+)",
    log
)
if m:
    date, time, level, module, message = m.groups()
    print(f"Level: {level}, Module: {module}")
    # Level: ERROR, Module: auth


# --- Named groups (?P<name>...) — much more readable ---
pattern = r"""
    (?P<date>\d{4}-\d{2}-\d{2})\s+
    (?P<time>\d{2}:\d{2}:\d{2})\s+
    (?P<level>\w+)\s+
    \[(?P<module>\w+)\]\s+
    (?P<message>.+)
"""

m = re.search(pattern, log, re.VERBOSE)
if m:
    # Match objects support name-based subscripting, same as .group(name).
    print(m["level"])    # "ERROR"
    print(m["module"])   # "auth"
    print(m.groupdict())
    # {'date': '2026-03-15', 'time': '14:30:22', 'level': 'ERROR',
    #  'module': 'auth', 'message': 'Login failed for user@example.com'}


# --- Backreferences ---
# \1 matches the exact text the first group captured — finds repeated words.
text = "the the quick brown fox fox"
dupes = re.findall(r"\b(\w+)\s+\1\b", text)
print(dupes)  # ['the', 'fox']
💡 Use re.VERBOSE for complex patterns: The re.VERBOSE (or re.X) flag lets you add whitespace and comments inside patterns. Makes regex maintainable.

Lookaheads and Lookbehinds

Zero-width assertions — they check what's around a match without including it in the result.

import re

# --- Lookahead (?=...) — match only if followed by ---
# Find numbers followed by "px"
text = "width: 100px; height: 200em; margin: 10px"
px_values = re.findall(r"\d+(?=px)", text)
print(px_values)  # ['100', '10']

# --- Negative lookahead (?!...) — match only if NOT followed by ---
# Find numbers NOT followed by "px".
# BUG FIX: the tempting r"\d+(?!px)\b" actually returns [] on this input.
# Backtracking lets "10" (inside "100px") satisfy (?!px) — the next char is
# "0", not "px" — but then \b fails between two digits; and "200" fails \b
# because the following "e" (of "em") is also a word character.  Instead,
# forbid any run of trailing digits followed by "px":
non_px = re.findall(r"\d+(?!\d*px)", text)
print(non_px)  # ['200']

# --- Lookbehind (?<=...) — match only if preceded by ---
# Extract price values after "$"
prices = "Items: $29.99, €15.50, $149.00, £30"
usd = re.findall(r"(?<=\$)\d+\.\d{2}", prices)
print(usd)  # ['29.99', '149.00']

# --- Negative lookbehind (?<!...) — match only if NOT preceded by ---
# Find words not preceded by "@" (exclude mentions)
text = "hello @world from python"
words = re.findall(r"(?<!@)\b\w+\b", text)
print(words)  # ['hello', 'from', 'python']


# --- Practical: password validation with lookaheads ---
def validate_password(password: str) -> tuple[bool, list[str]]:
    """Check password strength using lookaheads.

    Returns (ok, errors); errors lists every requirement the password failed,
    in a fixed order.
    """
    rules = [
        (r"(?=.*[A-Z])", "At least one uppercase letter"),
        (r"(?=.*[a-z])", "At least one lowercase letter"),
        (r"(?=.*\d)", "At least one digit"),
        (r"(?=.*[!@#$%^&*])", "At least one special character"),
    ]
    errors = []
    if len(password) < 8:
        errors.append("At least 8 characters")
    # Each lookahead scans the whole string for one required character class.
    errors.extend(msg for pat, msg in rules if not re.search(pat, password))
    return not errors, errors

ok, issues = validate_password("Hello123!")
print(ok, issues)  # True []

ok, issues = validate_password("hello")
print(ok, issues)  # False ['At least 8 characters', 'At least one uppercase letter', ...]

Compiled Patterns (Performance)

import re

# Compile once, reuse many times — noticeably faster when applied repeatedly.
EMAIL_RE = re.compile(
    r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",
    re.IGNORECASE,
)

# Sample inputs to validate
emails = [
    "alice@example.com",
    "not-an-email",
    "bob@company.co.uk",
    "bad@",
]

# fullmatch returns a truthy Match only when the WHOLE string is an email,
# so filter keeps exactly the valid addresses.
valid = list(filter(EMAIL_RE.fullmatch, emails))
print(valid)  # ['alice@example.com', 'bob@company.co.uk']


# Compile with flags
LOG_PATTERN = re.compile(r"""
    ^(?P<ip>[\d.]+)\s+        # IP address
    -\s+-\s+                     # Ident and auth (usually -)
    \[(?P<date>[^\]]+)\]\s+   # Date in brackets
    "(?P<method>\w+)\s+       # HTTP method
     (?P<path>\S+)\s+         # Request path
     HTTP/[\d.]+"               # HTTP version
    \s+(?P<status>\d{3})       # Status code
    \s+(?P<size>\d+)           # Response size
""", re.VERBOSE)

sample_line = '192.168.1.1 - - [26/Mar/2026:14:30:22 +0000] "GET /api/users HTTP/1.1" 200 4523'
# Walrus operator: bind and test the match in one step.
if (m := LOG_PATTERN.match(sample_line)) is not None:
    print(m.groupdict())
    # {'ip': '192.168.1.1', 'date': '26/Mar/2026:14:30:22 +0000',
    #  'method': 'GET', 'path': '/api/users', 'status': '200', 'size': '4523'}

Practical Patterns You'll Actually Use

Extract data from structured text

import re

# --- Parse key-value pairs ---
config_text = """
host = localhost
port = 5432
database = myapp
debug = true
"""

# re.MULTILINE anchors ^/$ to each line, so one findall grabs every pair.
pairs = {key: val for key, val in
         re.findall(r"^(\w+)\s*=\s*(.+)$", config_text, re.MULTILINE)}
print(pairs)
# {'host': 'localhost', 'port': '5432', 'database': 'myapp', 'debug': 'true'}


# --- Extract URLs from text ---
text = "Visit https://example.com or http://api.test.io/v2/data?key=123"
urls = re.findall(r"https?://[^\s<>\"']+", text)
print(urls)
# ['https://example.com', 'http://api.test.io/v2/data?key=123']


# --- Parse CSV-like data (handling quoted fields) ---
line = 'John,"Smith, Jr.",42,"New York"'
fields = re.findall(r'(?:"([^"]*)")|([^,]+)', line)
# Per field, the quoted group wins when present; otherwise take the bare one.
values = [q or u for q, u in fields]
print(values)  # ['John', 'Smith, Jr.', '42', 'New York']


# --- Clean HTML tags ---
html = "<p>Hello <b>world</b>, this is <a href='#'>a link</a>.</p>"
clean = re.sub(r"<[^>]+>", "", html)
print(clean)  # "Hello world, this is a link."

Data validation patterns

import re

# Compiled validators, keyed by name.  Every pattern is anchored with ^...$,
# so .match() below effectively tests the whole string.
PATTERNS = {
    # Phone: international format
    "phone": re.compile(r"^\+?1?\d{9,15}$"),

    # IPv4 address — each octet constrained to 0-255
    "ipv4": re.compile(
        r"^(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}"
        r"(?:25[0-5]|2[0-4]\d|[01]?\d\d?)$"
    ),

    # ISO date (YYYY-MM-DD) — months 01-12, days 01-31 only
    "date": re.compile(r"^\d{4}-(?:0[1-9]|1[0-2])-(?:0[1-9]|[12]\d|3[01])$"),

    # Hex color (#RGB or #RRGGBB)
    "hex_color": re.compile(r"^#(?:[0-9a-fA-F]{3}){1,2}$"),

    # Slug (URL-safe string): lowercase runs joined by single hyphens
    "slug": re.compile(r"^[a-z0-9]+(?:-[a-z0-9]+)*$"),

    # Semantic version: MAJOR.MINOR.PATCH + optional prerelease/build metadata
    "semver": re.compile(r"^\d+\.\d+\.\d+(?:-[\w.]+)?(?:\+[\w.]+)?$"),

    # UUID v4 (version nibble fixed to 4, variant nibble 8/9/a/b)
    "uuid": re.compile(
        r"^[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$",
        re.IGNORECASE,
    ),
}


def validate(value: str, pattern_name: str) -> bool:
    """Return True when *value* matches the named pattern.

    Raises ValueError for an unknown pattern name.
    """
    if pattern_name not in PATTERNS:
        raise ValueError(f"Unknown pattern: {pattern_name}")
    return bool(PATTERNS[pattern_name].match(value))


# Quick self-test
assert validate("192.168.1.1", "ipv4")
assert validate("2026-03-15", "date")
assert validate("#ff6600", "hex_color")
assert validate("hello-world-42", "slug")
assert validate("2.1.0-beta.1", "semver")
assert not validate("999.1.1.1", "ipv4")
assert not validate("2026-13-01", "date")

Log parsing and data extraction

import re
from collections import Counter


def parse_nginx_logs(log_text: str) -> list[dict]:
    """Parse nginx access log into structured records."""
    # Combined log format: ip - - [date] "method path proto" status size "ref" "ua"
    pattern = re.compile(
        r'(?P<ip>[\d.]+)\s+'
        r'- - '
        r'\[(?P<date>[^\]]+)\]\s+'
        r'"(?P<method>\w+)\s+(?P<path>\S+)\s+HTTP/[\d.]+"\s+'
        r'(?P<status>\d{3})\s+'
        r'(?P<size>\d+)\s+'
        r'"(?P<referrer>[^"]*)"\s+'
        r'"(?P<user_agent>[^"]*)"'
    )

    records = []
    for raw_line in log_text.strip().split("\n"):
        m = pattern.match(raw_line)
        if m is None:
            continue  # malformed lines are skipped silently
        rec = m.groupdict()
        # status and size are the only numeric fields
        rec["status"] = int(rec["status"])
        rec["size"] = int(rec["size"])
        records.append(rec)
    return records


def analyze_logs(records: list[dict]):
    """Quick analysis of parsed logs.

    Expects records shaped like parse_nginx_logs output (int 'status'/'size').
    """
    by_status = Counter(rec["status"] for rec in records)
    return {
        "total_requests": len(records),
        "status_distribution": dict(by_status),
        "top_paths": Counter(rec["path"] for rec in records).most_common(10),
        # 4xx and 5xx responses count as errors
        "error_count": sum(1 for rec in records if rec["status"] >= 400),
        "total_bytes": sum(rec["size"] for rec in records),
    }

Advanced: re.sub with Functions

import re

# --- Replace with a function (dynamic replacements) ---
def censor_emails(text: str) -> str:
    """Replace email addresses with censored versions."""
    EMAIL = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"

    def mask(m):
        # Keep the first character of the local part plus the whole domain.
        # The pattern guarantees exactly one "@" in the matched text.
        user, _, domain = m.group().partition("@")
        return f"{user[0]}***@{domain}"

    return re.sub(EMAIL, mask, text)

text = "Contact alice@example.com or bob.smith@company.co.uk"
print(censor_emails(text))
# "Contact a***@example.com or b***@company.co.uk"


# --- Template substitution ---
def expand_template(template: str, variables: dict) -> str:
    """Replace {{variable}} placeholders with values from *variables*."""
    def fill(m):
        name = m.group(1).strip()
        if name in variables:
            return str(variables[name])
        # Unknown variable: leave the placeholder exactly as written.
        return m.group()

    return re.sub(r"\{\{\s*(\w+)\s*\}\}", fill, template)

result = expand_template(
    "Hello {{name}}, your order #{{order_id}} is {{status}}.",
    {"name": "Alice", "order_id": "12345", "status": "shipped"},
)
print(result)  # "Hello Alice, your order #12345 is shipped."


# --- Convert camelCase to snake_case ---
def camel_to_snake(name: str) -> str:
    """Convert camelCase / PascalCase identifiers to snake_case."""
    # Pass 1 splits an acronym off a following word: "XMLParser" -> "XML_Parser".
    step = re.sub(r"(.)([A-Z][a-z]+)", r"\1_\2", name)
    # Pass 2 splits lower/digit followed by upper ("aB" -> "a_B"), then lowercases.
    return re.sub(r"([a-z0-9])([A-Z])", r"\1_\2", step).lower()

print(camel_to_snake("getUserById"))      # "get_user_by_id"
print(camel_to_snake("HTTPSConnection"))   # "https_connection"
print(camel_to_snake("parseJSON"))         # "parse_json"

Flags Reference

| Flag | Short | Effect |
|---|---|---|
| `re.IGNORECASE` | `re.I` | Case-insensitive matching |
| `re.MULTILINE` | `re.M` | `^` and `$` match line start/end |
| `re.DOTALL` | `re.S` | `.` matches newlines too |
| `re.VERBOSE` | `re.X` | Allow whitespace and comments in pattern |
| `re.ASCII` | `re.A` | `\w`, `\d` match ASCII only |

# Combine flags with | (bitwise OR)
MAIL_FLAGS = re.VERBOSE | re.IGNORECASE | re.MULTILINE
pattern = re.compile(r"""
    ^from:\s+       # Sender line
    (.+)            # Capture sender name/email
    $
""", MAIL_FLAGS)

Common Pitfalls

# Greedy vs lazy quantifiers
html = "<b>hello</b> and <b>world</b>"

# Greedy .* grabs as much as possible — it runs to the LAST closing tag.
greedy = re.findall(r"<b>(.*)</b>", html)
print(greedy)
# ['hello</b> and <b>world']  ← wrong!

# Lazy .*? stops at the FIRST closing tag it can.
lazy = re.findall(r"<b>(.*?)</b>", html)
print(lazy)
# ['hello', 'world']  ← correct!

When NOT to Use Regex

Regex isn't always the answer. Python has better tools for some tasks: use `in` and string methods (`startswith`, `split`, `removeprefix`) for fixed substrings, the `csv` module for CSV files, `json` for JSON, `urllib.parse` for URLs, and a real HTML parser (`html.parser` or BeautifulSoup) for anything beyond stripping simple tags — regex famously cannot parse nested HTML reliably.

🚀 Want 50+ production-ready Python scripts including text processors, scrapers, and automation tools?

Get the AI Agent Toolkit →

Related Articles

Need custom text processing or data extraction scripts? I build Python automation tools and data pipelines. Reach out on Telegram →