Python dataclass vs Pydantic:数据类选型指南
一句话总结
- dataclass:标准库,零依赖,适合内部数据结构
- Pydantic v2:带验证 + 序列化,适合处理外部数据(API/配置)
dataclass 核心用法
from dataclasses import dataclass, field
from typing import ClassVar
@dataclass
class User:
    """Plain user record.

    ``name`` and ``age`` are required; ``email`` defaults to an empty string
    and ``tags`` to a new empty list for every instance.
    """

    name: str
    age: int
    email: str = field(default="")
    # A mutable default has to go through default_factory so that instances
    # do not share one list object.
    tags: list[str] = field(default_factory=list)

    # ClassVar annotations are skipped by @dataclass: this stays a class
    # attribute and is not an __init__ parameter or instance field.
    table_name: ClassVar[str] = "users"
# Build an instance; the generated __init__ accepts the fields by keyword too.
u = User(name="Alice", age=30, email="alice@example.com", tags=["admin"])
print(u)  # User(name='Alice', age=30, email='alice@example.com', tags=['admin'])

# @dataclass generates __eq__, so instances with equal fields compare equal.
u2 = User(name="Alice", age=30, email="alice@example.com", tags=["admin"])
print(u == u2)  # True
`__post_init__`:初始化后处理
from dataclasses import dataclass
import re
@dataclass
class Email:
    """Email address that is validated and lower-cased on construction.

    Raises:
        ValueError: if ``address`` does not have the shape
            ``local@domain.tld`` with exactly one ``@``.
    """

    address: str

    def __post_init__(self):
        """Validate ``address`` against the pattern, then normalize its case."""
        # fullmatch anchors both ends; re.match only anchors the start and
        # would accept trailing junk such as "a@b.com@evil".
        if not re.fullmatch(r"[^@]+@[^@]+\.[^@]+", self.address):
            raise ValueError(f"Invalid email: {self.address}")
        self.address = self.address.lower()  # normalize to lower case
# A valid address is accepted and normalized by __post_init__.
e = Email("Alice@Example.COM")
print(e.address)  # alice@example.com

# An address that fails the pattern check raises ValueError.
try:
    Email("not-an-email")
except ValueError as exc:
    print(exc)  # Invalid email: not-an-email
frozen=True:不可变数据类
from dataclasses import dataclass
@dataclass(frozen=True)
class Point:
    """Immutable 2-D point; frozen=True forbids assigning to fields."""

    x: float
    y: float

    def distance(self) -> float:
        """Return the Euclidean distance from the origin to this point."""
        squared_norm = self.x ** 2 + self.y ** 2
        return squared_norm ** 0.5
p = Point(3.0, 4.0)
print(p.distance())  # 5.0
# Assigning to a field (e.g. `p.x = 1.0`) raises dataclasses.FrozenInstanceError.

# A frozen dataclass with the generated __eq__ is hashable, so instances can
# be used as dict keys or set members.
unit_x = Point(1.0, 0.0)
points = {p, unit_x}
dataclass 的局限
from dataclasses import dataclass
@dataclass
class Order:
    """Order record holding a price and a quantity.

    The annotations are not enforced at runtime.
    """

    price: float
    quantity: int
import json
from dataclasses import asdict

# 1. No runtime validation: wrong types are stored as-is.
o = Order(price="not a float", quantity="abc")  # no error is raised
print(o)  # Order(price='not a float', quantity='abc')

# 2. JSON serialization is manual: convert to a dict first, then dump it.
o2 = Order(99.9, 2)
payload = asdict(o2)
print(json.dumps(payload))  # {"price": 99.9, "quantity": 2}
# asdict may not be enough once nested complex types are involved.

# 3. There is no built-in way to deserialize from a dict.
data = {"price": 99.9, "quantity": 2}
o3 = Order(**data)  # manual unpacking, no validation
Pydantic v2 核心用法
# pip install pydantic
from pydantic import BaseModel, Field, field_validator
from typing import Annotated
class User(BaseModel):
    # User payload; every field is validated when the model is constructed.
    name: str
    # Constraints: ge = greater than or equal to, le = less than or equal to.
    age: int = Field(ge=0, le=150)
    email: str = ""
    # Pydantic copies mutable field defaults per instance, so a literal list
    # is not shared between models here.
    tags: list[str] = []
from pydantic import ValidationError

# Fields are validated automatically on construction.
u = User(name="Alice", age=30, email="alice@example.com")
print(u)
# name='Alice' age=30 email='alice@example.com' tags=[]

# Type coercion (default lax mode): the string "25" is converted to int.
u2 = User(name="Bob", age="25")
print(u2.age, type(u2.age))  # 25 <class 'int'>

# Validation failure: 200 violates le=150.
# Catch ValidationError specifically so unrelated errors are not swallowed.
try:
    User(name="Eve", age=200)
except ValidationError as e:
    print(e)
    # 1 validation error for User
    # age
    #   Input should be less than or equal to 150 [type=less_than_equal, ...]
Pydantic v2 性能
Pydantic v2 用 Rust 重写了核心逻辑(pydantic-core),比 v1 快 5-50 倍:
pip install pydantic # installs v2 by default
python -c "import pydantic; print(pydantic.__version__)" # 2.x.x
validators:字段验证
from pydantic import BaseModel, field_validator, model_validator
import re
class SignupForm(BaseModel):
    # Sign-up payload: normalizes username and email to lower case and checks
    # that the two password fields agree.
    username: str
    password: str
    confirm_password: str
    email: str

    @field_validator("username")
    @classmethod
    def username_valid(cls, v: str) -> str:
        """Require 3-20 letters, digits or underscores; return it lower-cased."""
        # fullmatch: with re.match, `$` also matches just before a trailing
        # newline, so "alice\n" would pass the check.
        if not re.fullmatch(r"^[a-zA-Z0-9_]{3,20}$", v):
            raise ValueError("用户名只能包含字母数字下划线,3-20字符")
        return v.lower()

    @field_validator("email")
    @classmethod
    def email_valid(cls, v: str) -> str:
        """Reject values without an "@" and return the address lower-cased."""
        if "@" not in v:
            raise ValueError("邮箱格式错误")
        return v.lower()

    @model_validator(mode="after")
    def passwords_match(self) -> "SignupForm":
        """Run after field validation; fail when the passwords differ."""
        if self.password != self.confirm_password:
            raise ValueError("两次密码不一致")
        return self
from pydantic import ValidationError

# Demo: mismatched passwords are rejected by the model-level validator.
# Catch ValidationError specifically so unrelated errors are not swallowed.
try:
    SignupForm(
        username="alice",
        password="secret123",
        confirm_password="wrong",
        email="alice@example.com",
    )
except ValidationError as e:
    print(e)
    # 1 validation error for SignupForm
    #   Value error, 两次密码不一致 [type=value_error, ...]
嵌套模型和列表验证
from pydantic import BaseModel
class Address(BaseModel):
    # Postal address; all three fields are required strings.
    street: str
    city: str
    zip_code: str
class Order(BaseModel):
    # Order with a list of item names and a nested Address model.
    id: int
    items: list[str]
    # A nested dict passed for this field is validated into an Address.
    address: Address
    total: float
# Build the model from a nested dict.
data = {
    "id": 1,
    "items": ["book", "pen"],
    "address": {
        "street": "123 Main St",
        "city": "Shanghai",
        "zip_code": "200000",
    },
    "total": 49.99,
}
order = Order(**data)
print(order.address.city)  # Shanghai

# Serialization
print(order.model_dump())  # -> dict
print(order.model_dump_json())  # -> JSON string
model_config:行为配置
from pydantic import BaseModel, ConfigDict
class StrictUser(BaseModel):
    # strict=True disables type coercion for this model.
    model_config = ConfigDict(strict=True)

    age: int
from pydantic import ValidationError

# In strict mode the string "25" is not converted to int.
# Catch ValidationError specifically so unrelated errors are not swallowed.
try:
    StrictUser(age="25")
except ValidationError as e:
    print(e)
    # 1 validation error for StrictUser
    # age
    #   Input should be a valid integer [type=int_type, ...]
# from_attributes=True: allows building the model from an ORM object's
# attributes (replaces v1's orm_mode).
class UserSchema(BaseModel):
    model_config = ConfigDict(from_attributes=True)

    name: str
    age: int
# Stand-in for a SQLAlchemy ORM object.
class FakeOrmUser:
    """Minimal object exposing ``name`` and ``age`` as class attributes."""

    name = "Alice"
    age = 30
orm_user = FakeOrmUser()
# model_validate reads the fields from the object's attributes.
schema = UserSchema.model_validate(orm_user)
print(schema)  # name='Alice' age=30
常用方法速查
from pydantic import BaseModel
class Item(BaseModel):
    # Item with a required name and price. A `#` comment is used instead of a
    # docstring so the generated JSON schema stays unchanged.
    name: str
    price: float
item = Item(name="Book", price=29.9)

# Serialization
d = item.model_dump()  # -> dict
j = item.model_dump_json()  # -> JSON string
j_indent = item.model_dump_json(indent=2)  # -> indented JSON string

# Deserialization
item2 = Item.model_validate({"name": "Pen", "price": 5.0})
item3 = Item.model_validate_json('{"name": "Pen", "price": 5.0}')

# JSON Schema
schema = Item.model_json_schema()
print(schema)
# {'properties': {'name': {'title': 'Name', 'type': 'string'},
#                 'price': {'title': 'Price', 'type': 'number'}},
#  'required': ['name', 'price'], 'title': 'Item', 'type': 'object'}

# Partial update: model_copy returns a copy with the given fields replaced.
item4 = item.model_copy(update={"price": 19.9})
print(item4)  # name='Book' price=19.9
选型指南
用 dataclass 的场景
# ✅ Internal data structure that needs no validation
@dataclass
class Config:
    """Configuration with defaults for host, port and the debug flag."""

    host: str = "localhost"
    port: int = 8080
    debug: bool = False
# ✅ 追求零依赖
# ✅ 性能敏感的热路径(dataclass 比 Pydantic 快)
# ✅ 需要可变/不可变控制(frozen=True)
用 Pydantic 的场景
# ✅ API request/response bodies
class CreateUserRequest(BaseModel):
    # All three fields are required strings.
    username: str
    password: str
    email: str
# ✅ Parsing configuration files
from pydantic_settings import BaseSettings, SettingsConfigDict  # pip install pydantic-settings


class Settings(BaseSettings):
    # Pydantic v2 style: options go in `model_config`. The nested
    # `class Config` is the deprecated v1 spelling.
    model_config = SettingsConfigDict(env_file=".env")

    database_url: str
    api_key: str
    debug: bool = False


# Values are read from environment variables or the .env file.
settings = Settings()
# ✅ 外部数据校验(用户输入、第三方 API 响应)
# ✅ 需要 JSON 序列化/反序列化
# ✅ FastAPI(内置 Pydantic)
attrs:第三方替代选项
# pip install attrs
import attr
@attr.s(auto_attribs=True)
class Point:
    """Point whose ``x`` coordinate must be non-negative.

    Raises:
        ValueError: from the ``x`` validator when ``x`` is negative.
    """

    # `x` must be declared with attr.ib() so the name `x` is bound in the
    # class body; with a bare annotation `@x.validator` below would raise
    # NameError when the class is created.
    x: float = attr.ib()
    y: float = 0.0

    @x.validator
    def _check_x(self, attribute, value):
        """Reject negative values for ``x``."""
        if value < 0:
            raise ValueError("x must be non-negative")


p = Point(1.0, 2.0)
print(p)  # Point(x=1.0, y=2.0)
attrs 比 dataclass 功能更丰富(validators、converters),但没有 Pydantic 的 JSON 序列化和生态支持。
总结对比表
| 特性 | dataclass | Pydantic v2 | attrs |
|---|---|---|---|
| 标准库 | ✅ | ❌ | ❌ |
| 运行时验证 | ❌ | ✅ | ✅ |
| 类型强制转换 | ❌ | ✅ | 部分 |
| JSON 序列化 | 手动 | 内置 | 手动 |
| 性能 | ⚡⚡⚡ | ⚡⚡(v2 很快) | ⚡⚡⚡ |
| 学习曲线 | 低 | 中 | 中 |
| FastAPI 集成 | 有限(可用,内部转为 Pydantic 处理) | ⭐⭐⭐ | 差 |
结论:内部结构用 dataclass,处理外部数据用 Pydantic v2,这是 Python 社区的主流做法。