Parse JSON with dataclasses like pydantic
Reply to indently.io's dataclass approach
We get JSON parsed into a dict with json.load() or json.loads().
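A minimal sketch of that step, assuming the JSON lives in a hypothetical person.json file (or in a string):

import json

with open("person.json") as f:
    data = json.load(f)  # or: data = json.loads(json_string)

For this post, that gives us the following dict: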
data = {
    "name": "Bob",
    "age": 29,
    "email": "bob@bob.com",
    "address": {
        "street": "42 Bob Avenue",
        "city": "Sydney",
        "state": "NSW",
        "postal_code": "2000",
        "country": "Australia",
    },
    "contacts": {
        "phone_numbers": ["+61 B0B 123 456", "+61 B0B 654 321"],
        "emergency_contact": {
            "name": "Bob",
            "relation": "Father",
            "phone": "+61, BOB 111 222"
        }
    }
}

We create the type classes:
from dataclasses import dataclass

@dataclass
class EmergencyContact:
    name: str
    relation: str
    phone: str

@dataclass
class Contacts:
    phone_numbers: list[str]
    emergency_contact: EmergencyContact

@dataclass
class Address:
    street: str
    city: str
    state: str
    postal_code: str
    country: str

@dataclass
class Person:
    name: str
    age: int
    email: str
    address: Address
    contacts: Contacts

We cry, because this parses only the top-level attributes, while the nested fields are still plain dicts:
from pprint import pprint

pprint(Person(**data))

Person(name='Bob',
       age=29,
       email='bob@bob.com',
       address={'city': 'Sydney',
                'country': 'Australia',
                'postal_code': '2000',
                'state': 'NSW',
                'street': '42 Bob Avenue'},
       contacts={'emergency_contact': {'name': 'Bob',
                                       'phone': '+61, BOB 111 222',
                                       'relation': 'Father'},
                 'phone_numbers': ['+61 B0B 123 456', '+61 B0B 654 321']})
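The top level is fine, but the nested data only supports dict access - attribute access blows up. A quick sketch:

person = Person(**data)
person.address["city"]  # works, but address is still just a dict
person.address.city     # AttributeError: 'dict' object has no attribute 'city'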
Approach #1 - be manual
Do as indently.io does and manually instantiate all the fields - but no thanks.
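Roughly, the manual route looks like this (a sketch, not necessarily indently.io's exact code):

person = Person(
    name=data["name"],
    age=data["age"],
    email=data["email"],
    address=Address(**data["address"]),
    contacts=Contacts(
        phone_numbers=data["contacts"]["phone_numbers"],
        emergency_contact=EmergencyContact(**data["contacts"]["emergency_contact"]),
    ),
)

Every nested level is spelled out by hand, which gets old fast as the models grow.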
Approach #2 - use __post_init__
If you only have one nested dataclass, then why not just:
@dataclass
class Contacts:
    phone_numbers: list[str]
    emergency_contact: EmergencyContact

    def __post_init__(self):
        if not isinstance(self.emergency_contact, EmergencyContact) and isinstance(self.emergency_contact, dict):
            self.emergency_contact = EmergencyContact(**self.emergency_contact)

Approach #3 - reusable base class
Same idea, but generic: a base class whose __post_init__ loops over the fields and converts any dict whose declared type is a dataclass.
from dataclasses import dataclass, fields, is_dataclass

@dataclass
class ParseSubDataclasses:
    def __post_init__(self):
        for field in fields(self):
            if not is_dataclass(field.type):
                continue
            if isinstance(values := getattr(self, field.name), dict):
                setattr(self, field.name, field.type(**values))

Add it to every class that has nested dataclass fields:
@dataclass
class Contacts(ParseSubDataclasses):
    phone_numbers: list[str]
    emergency_contact: EmergencyContact

@dataclass
class Person(ParseSubDataclasses):
    name: str
    age: int
    email: str
    address: Address
    contacts: Contacts

Result:
Person(
    name="Bob",
    age=29,
    email="bob@bob.com",
    address=Address(
        street="42 Bob Avenue",
        city="Sydney",
        state="NSW",
        postal_code="2000",
        country="Australia",
    ),
    contacts=Contacts(
        phone_numbers=["+61 B0B 123 456", "+61 B0B 654 321"],
        emergency_contact=EmergencyContact(
            name="Bob",
            relation="Father",
            phone="+61, BOB 111 222",
        ),
    ),
)
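The nested fields are now real dataclasses, so attribute access works all the way down. A quick check:

person = Person(**data)
print(person.address.city)                      # Sydney
print(person.contacts.emergency_contact.phone)  # +61, BOB 111 222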
Use when:
- Nested dataclasses are common in your models.
- You need correctness more than raw speed - looping over every field costs a little time, and it is redundant for classes that have no nested dataclasses at all.
- You are not sure the model will stay this way, i.e. it might grow a new nested dataclass later (see the sketch below).
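To illustrate that last point: if the model grows a new nested dataclass (a hypothetical Pet class, purely for illustration), no extra parsing code is needed - the base class picks it up:

@dataclass
class Pet:
    name: str
    species: str

@dataclass
class Person(ParseSubDataclasses):
    name: str
    age: int
    email: str
    address: Address
    contacts: Contacts
    pet: Pet  # new nested field, still parsed from a dict automatically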
Approach #4 - be recursive
def check_attribute_tree(self):
    for field in fields(self):
        if not is_dataclass(field.type):
            continue
        if isinstance(values := getattr(self, field.name), dict):
            setattr(self, field.name, check_attribute_tree(field.type(**values)))
    return self

@dataclass
class RootParseSubDataclasses:
    def __post_init__(self):
        check_attribute_tree(self)

@dataclass
class Person(RootParseSubDataclasses):
    name: str
    age: int
    email: str
    address: Address
    contacts: Contacts

Because check_attribute_tree() walks the tree recursively, only the root class needs the base class - Address, Contacts and EmergencyContact can stay plain dataclasses.

Approach #5 - be pydantic
Pydantic also validates at runtime, whereas plain dataclasses are only checked statically by tools like mypy. With the same trick we can go over each field and actually check the types, not just convert the nested dataclasses:
from dataclasses import dataclass, fields, is_dataclass
from typing import get_origin

def make_me_pydantic(self):
    for field in fields(self):
        value = getattr(self, field.name)
        if isinstance(value, get_origin(field.type) or field.type):
            continue
        if is_dataclass(field.type) and isinstance(value, dict):
            setattr(self, field.name, make_me_pydantic(field.type(**value)))
            continue
        raise ValueError(f"`{type(self).__name__}.{field.name} = {value!r}` should be of type {field.type=}, but is {type(value)}")
    return self

@dataclass
class RootParseSubDataclasses:
    def __post_init__(self):
        make_me_pydantic(self)

@dataclass
class Person(RootParseSubDataclasses):
    name: str
    age: int
    email: str
    address: Address
    contacts: Contacts

Let's change the Person.age in the data dict into a string, like:
data = {
    "name": "Bob",
    "age": "29",
    "email": "bob@bob.com",
    "address": {
        "street": "42 Bob Avenue",
        "city": "Sydney",
        "state": "NSW",
        "postal_code": "2000",
        "country": "Australia",
    },
    "contacts": {
        "phone_numbers": ["+61 B0B 123 456", "+61 B0B 654 321"],
        "emergency_contact": {"name": "Bob", "relation": "Father", "phone": "+61, BOB 111 222"},
    },
}

And we get:

ValueError: Person.age = '29' should be of type field.type=<class 'int'>, but is <class 'str'>
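That error comes from simply constructing the model again, e.g.:

person = Person(**data)  # raises ValueError because "29" is a str, not an int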