Python dataclass vs Pydantic:数据类选型指南

dataclass 是标准库的轻量选择,Pydantic v2 是带验证的重武器,什么时候用哪个,这篇说清楚。

约 1.3k 字 / 约 8 分钟阅读

Python dataclass vs Pydantic:数据类选型指南

一句话总结

  • dataclass:标准库,零依赖,适合内部数据结构
  • Pydantic v2:带验证 + 序列化,适合处理外部数据(API/配置)

dataclass 核心用法

from dataclasses import dataclass, field
from typing import ClassVar

@dataclass
class User:
    """Plain stdlib dataclass with two required and two defaulted fields."""

    name: str
    age: int
    email: str = ""
    # A mutable default must go through default_factory so that every
    # instance gets its own list.
    tags: list[str] = field(default_factory=list)

    # A ClassVar annotation is not turned into an instance field.
    table_name: ClassVar[str] = "users"

# Create an instance (positional arguments follow the field declaration order).
u = User("Alice", 30, "alice@example.com", ["admin"])
print(u)  # User(name='Alice', age=30, email='alice@example.com', tags=['admin'])

# Comparison: @dataclass generates __eq__, which compares field values.
u2 = User("Alice", 30, "alice@example.com", ["admin"])
print(u == u2)  # True

__post_init__:初始化后处理

from dataclasses import dataclass
import re

@dataclass
class Email:
    """Email address that is validated and lower-cased on construction.

    Raises:
        ValueError: if ``address`` does not look like ``local@domain.tld``.
    """

    address: str

    def __post_init__(self):
        """Validate ``address`` and normalize it to lower case."""
        # fullmatch, not match: re.match only anchors at the start, so an
        # input such as "a@b.com@evil" would pass on its valid-looking prefix.
        if not re.fullmatch(r"[^@]+@[^@]+\.[^@]+", self.address):
            raise ValueError(f"Invalid email: {self.address}")
        self.address = self.address.lower()  # normalize to lower case

# __post_init__ runs as part of construction, so the stored address is
# already lower-cased here.
e = Email("Alice@Example.COM")
print(e.address)  # alice@example.com

# An address that fails the pattern check raises ValueError.
try:
    Email("not-an-email")
except ValueError as err:
    print(err)  # Invalid email: not-an-email

frozen=True:不可变数据类

from dataclasses import dataclass

@dataclass(frozen=True)
class Point:
    """Immutable 2-D point."""

    x: float
    y: float

    def distance(self) -> float:
        """Return the Euclidean distance between the origin and this point."""
        squared_norm = self.x ** 2 + self.y ** 2
        return squared_norm ** 0.5

p = Point(3.0, 4.0)
print(p.distance())  # 5.0
# p.x = 1.0  # would raise dataclasses.FrozenInstanceError

# A frozen dataclass (with the default eq=True) is hashable, so instances
# can be used as dict keys or put into a set.
points = {p, Point(1.0, 0.0)}

dataclass 的局限

from dataclasses import dataclass

@dataclass
class Order:
    """Order record; the field annotations are not enforced at runtime."""

    price: float
    quantity: int

# 1. No runtime validation: the annotations are not checked.
o = Order(price="not a float", quantity="abc")  # no error is raised!
print(o)  # Order(price='not a float', quantity='abc')

# 2. JSON serialization has to be written by hand.
import json
from dataclasses import asdict

o2 = Order(99.9, 2)
print(json.dumps(asdict(o2)))  # {"price": 99.9, "quantity": 2}
# asdict may not be enough once complex nested types are involved, though.

# 3. There is no built-in way to deserialize from a dict.
data = {"price": 99.9, "quantity": 2}
o3 = Order(**data)  # manual unpacking, no validation

Pydantic v2 核心用法

# pip install pydantic
from pydantic import BaseModel, Field, field_validator
from typing import Annotated

class User(BaseModel):
    # User model whose fields are validated when an instance is created.
    name: str
    age: int = Field(ge=0, le=150)  # ge = greater than or equal, le = less than or equal
    email: str = ""
    # NOTE(review): per the Pydantic docs a mutable default like [] is copied
    # for each instance rather than shared -- confirm for the version in use.
    tags: list[str] = []

# Validation runs automatically on construction.
u = User(name="Alice", age=30, email="alice@example.com")
print(u)
# name='Alice' age=30 email='alice@example.com' tags=[]

# Type coercion (default, non-strict mode)
u2 = User(name="Bob", age="25")  # "25" is converted to int
print(u2.age, type(u2.age))  # 25 <class 'int'>

# Validation failure
try:
    User(name="Eve", age=200)  # exceeds le=150
except Exception as e:  # NOTE(review): pydantic.ValidationError would be the narrower type to catch
    print(e)
# Abridged output:
# 1 validation error for User
# age
#   Input should be less than or equal to 150 [type=less_than_equal, input_value=200, input_type=int]

Pydantic v2 性能

Pydantic v2 用 Rust 重写了核心逻辑(pydantic-core),比 v1 快 5-50 倍:

pip install pydantic  # 默认安装 v2

python -c "import pydantic; print(pydantic.__version__)"  # 2.x.x

validators:字段验证

from pydantic import BaseModel, field_validator, model_validator
import re

class SignupForm(BaseModel):
    # Sign-up payload: username and email are validated and lower-cased per
    # field, then the two password fields are compared on the whole model.
    username: str
    password: str
    confirm_password: str
    email: str

    @field_validator("username")
    @classmethod
    def username_valid(cls, v: str) -> str:
        """Require 3-20 ASCII letters, digits or underscores; return it lower-cased.

        Raises:
            ValueError: if ``v`` contains other characters or has a bad length.
        """
        # fullmatch instead of match(r"^...$"): `$` also matches just before
        # a trailing newline, so "alice\n" would otherwise be accepted.
        if not re.fullmatch(r"[a-zA-Z0-9_]{3,20}", v):
            raise ValueError("用户名只能包含字母数字下划线,3-20字符")
        return v.lower()

    @field_validator("email")
    @classmethod
    def email_valid(cls, v: str) -> str:
        """Require an ``@`` in the address and return it lower-cased.

        Raises:
            ValueError: if ``v`` contains no ``@``.
        """
        if "@" not in v:
            raise ValueError("邮箱格式错误")
        return v.lower()

    @model_validator(mode="after")
    def passwords_match(self) -> "SignupForm":
        """Check that ``password`` equals ``confirm_password``.

        Declared with mode="after", so it receives the constructed model.

        Raises:
            ValueError: if the two passwords differ.
        """
        if self.password != self.confirm_password:
            raise ValueError("两次密码不一致")
        return self

# Try it out: the mismatching passwords are rejected by the model validator.
try:
    SignupForm(
        username="alice",
        password="secret123",
        confirm_password="wrong",
        email="alice@example.com",
    )
except Exception as e:
    print(e)
# Abridged output:
# 1 validation error for SignupForm
#   Value error, 两次密码不一致 [type=value_error, ...]

嵌套模型和列表验证

from pydantic import BaseModel

class Address(BaseModel):
    # Postal address; used as a nested field of Order below.
    street: str
    city: str
    zip_code: str

class Order(BaseModel):
    # Order with a list field and a nested Address model.
    id: int
    items: list[str]
    address: Address
    total: float

# Build the model from a nested dict; the inner dict becomes an Address.
data = {
    "id": 1,
    "items": ["book", "pen"],
    "address": {"street": "123 Main St", "city": "Shanghai", "zip_code": "200000"},
    "total": 49.99,
}
order = Order(**data)
print(order.address.city)  # Shanghai

# Serialization
print(order.model_dump())  # -> dict
print(order.model_dump_json())  # -> JSON string

model_config:行为配置

from pydantic import BaseModel, ConfigDict

class StrictUser(BaseModel):
    model_config = ConfigDict(strict=True)  # disallow type coercion

    age: int

# In strict mode "25" is not converted to int.
try:
    StrictUser(age="25")
except Exception as e:
    print(e)  # abridged: age / Input should be a valid integer [type=int_type, input_value='25', input_type=str]

# from_attributes=True: allow building the model from an object's attributes,
# e.g. an ORM instance (replaces orm_mode from v1).
class UserSchema(BaseModel):
    model_config = ConfigDict(from_attributes=True)

    name: str
    age: int

# Stand-in for a SQLAlchemy ORM object: it only exposes the `name` and `age`
# attributes that UserSchema reads.
class FakeOrmUser:
    name = "Alice"
    age = 30

# model_validate reads the fields from the object's attributes because
# UserSchema sets from_attributes=True.
orm_user = FakeOrmUser()
schema = UserSchema.model_validate(orm_user)
print(schema)  # name='Alice' age=30

常用方法速查

from pydantic import BaseModel

# Minimal model used by the method cheat sheet below.
class Item(BaseModel):
    name: str
    price: float

item = Item(name="Book", price=29.9)

# Serialization
d = item.model_dump()                    # -> dict
j = item.model_dump_json()               # -> JSON string
j_indent = item.model_dump_json(indent=2)  # -> indented JSON string

# Deserialization
item2 = Item.model_validate({"name": "Pen", "price": 5.0})
item3 = Item.model_validate_json('{"name": "Pen", "price": 5.0}')

# JSON Schema
schema = Item.model_json_schema()
print(schema)
# {'properties': {'name': {'title': 'Name', 'type': 'string'},
#   'price': {'title': 'Price', 'type': 'number'}},
#  'required': ['name', 'price'], 'title': 'Item', 'type': 'object'}

# Partial update: model_copy returns a copy with the given fields replaced.
# NOTE(review): per the Pydantic docs, values passed via `update` are not
# validated -- confirm before using it on untrusted data.
item4 = item.model_copy(update={"price": 19.9})
print(item4)  # name='Book' price=19.9

选型指南

用 dataclass 的场景

# ✅ Internal data structures that need no validation
@dataclass
class Config:
    """Configuration record with host/port/debug fields, all defaulted."""

    host: str = "localhost"
    port: int = 8080
    debug: bool = False

# ✅ Zero dependencies are wanted
# ✅ Performance-sensitive hot paths (dataclass is faster than Pydantic)
# ✅ Mutable/immutable control is needed (frozen=True)

用 Pydantic 的场景

# ✅ API request/response bodies
class CreateUserRequest(BaseModel):
    # Request payload with three required string fields.
    username: str
    password: str
    email: str

# ✅ Parsing configuration from the environment or a .env file
from pydantic_settings import BaseSettings, SettingsConfigDict  # pip install pydantic-settings

class Settings(BaseSettings):
    # Application settings. database_url and api_key have no default, so
    # they must be supplied by the environment or the .env file.

    # Pydantic v2 style configuration; the nested `class Config` is the
    # deprecated v1 spelling and triggers a deprecation warning.
    model_config = SettingsConfigDict(env_file=".env")

    database_url: str
    api_key: str
    debug: bool = False

settings = Settings()  # values are read from environment variables or .env

# ✅ Validating external data (user input, third-party API responses)
# ✅ JSON serialization/deserialization is needed
# ✅ FastAPI (Pydantic is built in)

attrs:第三方替代选项

# pip install attrs
import attr

@attr.s(auto_attribs=True)
class Point:
    """Point whose ``x`` coordinate is validated to be non-negative.

    Raises:
        ValueError: on construction when ``x`` is negative.
    """

    # x must be declared with attr.ib(): a bare annotation binds no name in
    # the class body, so the `@x.validator` decorator below would raise
    # NameError while the class is being created.
    x: float = attr.ib()
    y: float = 0.0

    @x.validator
    def _check_x(self, attribute, value):
        """Reject negative values for ``x``."""
        if value < 0:
            raise ValueError("x must be non-negative")

# attrs generates __init__ and __repr__ for the class.
p = Point(1.0, 2.0)
print(p)

attrs 比 dataclass 功能更丰富(validators、converters),但没有 Pydantic 的 JSON 序列化和生态支持。

总结对比表

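| 特性 | dataclass | Pydantic v2 | attrs |
| --- | --- | --- | --- |
| 标准库 | ✅ | ❌ | ❌ |
| 运行时验证 | ❌ | ✅ | ✅(validators) |
| 类型强制转换 | ❌ | ✅ | 部分(converters) |
| JSON 序列化 | 手动 | 内置 | 手动 |
| 性能 | ⚡⚡⚡ | ⚡⚡(v2 很快) | ⚡⚡⚡ |
| 学习曲线 | 低 | 中 | 中 |
| FastAPI 集成 | — | ⭐⭐⭐ | — |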

结论:内部结构用 dataclass,处理外部数据用 Pydantic v2,这是 Python 社区的主流做法。