3 min read

Parse JSON with dataclasses like pydantic

Parse JSON with dataclasses like pydantic

Reply to indently.io's dataclass approach

We get JSON parsed into a dict with json.load() or json.loads():

# Sample payload as a plain dict; the nested objects ("address", "contacts",
# "emergency_contact") are plain dicts as well.
data = {
    "name": "Bob",
    "age": 29,
    "email": "bob@bob.com",
    "address": {
        "street": "42 Bob Avenue",
        "city": "Sydney",
        "state": "NSW",
        "postal_code": "2000",
        "country": "Australia",
    },
    "contacts": {
        "phone_numbers": ["+61 B0B 123 456", "+61 B0B 654 321"],
        "emergency_contact": {
            "name": "Bob",
            "relation": "Father",
            "phone": "+61, BOB 111 222"
        }
    }
}

We create type classes:

from dataclasses import dataclass


@dataclass
class EmergencyContact:
    """Innermost model: mirrors the "emergency_contact" object of the payload."""

    name: str
    relation: str
    phone: str


@dataclass
class Contacts:
    """Mirrors the "contacts" object; ``emergency_contact`` is itself a dataclass."""

    phone_numbers: list[str]
    emergency_contact: EmergencyContact

@dataclass
class Address:
    """Mirrors the "address" object of the payload; all fields are strings."""

    street: str
    city: str
    state: str
    postal_code: str
    country: str

@dataclass
class Person:
    """Top-level model; ``address`` and ``contacts`` are annotated with nested dataclasses."""

    name: str
    age: int
    email: str
    address: Address
    contacts: Contacts

We cry, because this only parses the top-level attributes, while the nested fields are still plain dicts:

from pprint import pprint
# The generated __init__ stores the values as given, so nested dicts stay dicts.
pprint(Person(**data))
Person(name='Bob',
       age=29,
       email='bob@bob.com',
       address={'city': 'Sydney',
                'country': 'Australia',
                'postal_code': '2000',
                'state': 'NSW',
                'street': '42 Bob Avenue'},
       contacts={'emergency_contact': {'name': 'Bob',
                                       'phone': '+61, BOB 111 222',
                                       'relation': 'Father'},
                 'phone_numbers': ['+61 B0B 123 456', '+61 B0B 654 321']})

Approach #1 - be manual

Do as indently.io does and manually instantiate every nested field. No thanks.

Approach #2 - use __post_init__

If you have only one nested dataclass, then why not just:

@dataclass
class Contacts:
    """Contact details whose ``emergency_contact`` may be passed as a plain dict."""

    phone_numbers: list[str]
    emergency_contact: EmergencyContact

    def __post_init__(self):
        # Parsed JSON hands us a dict here; upgrade it to the declared dataclass.
        # Any non-dict value (e.g. an existing EmergencyContact) is left untouched.
        if isinstance(self.emergency_contact, dict):
            self.emergency_contact = EmergencyContact(**self.emergency_contact)

Approach #3 - reusable base class

from dataclasses import dataclass, fields, is_dataclass

@dataclass
class ParseSubDataclasses:
    """Base class that upgrades dict values of dataclass-typed fields to instances.

    After the generated ``__init__`` has run, every field whose annotation is
    a dataclass and whose current value is a ``dict`` is replaced by
    ``annotation(**value)``. All other fields are left untouched.
    """

    def __post_init__(self):
        # Only fields annotated with a dataclass are candidates for conversion.
        nested_specs = (spec for spec in fields(self) if is_dataclass(spec.type))
        for spec in nested_specs:
            raw = getattr(self, spec.name)
            if isinstance(raw, dict):
                setattr(self, spec.name, spec.type(**raw))

Add it as a base class to every class that has dataclass-typed fields:

@dataclass
class Contacts(ParseSubDataclasses):
    """Contacts model; the inherited ``__post_init__`` converts a dict ``emergency_contact``."""

    phone_numbers: list[str]
    emergency_contact: EmergencyContact


@dataclass
class Person(ParseSubDataclasses):
    """Person model; the inherited ``__post_init__`` converts dict ``address``/``contacts``."""

    name: str
    age: int
    email: str
    address: Address
    contacts: Contacts

Result:

Person(
    name="Bob",
    age=29,
    email="bob@bob.com",
    address=Address(
        street="42 Bob Avenue",
        city="Sydney",
        state="NSW",
        postal_code="2000",
        country="Australia",
    ),
    contacts=Contacts(
        phone_numbers=["+61 B0B 123 456", "+61 B0B 654 321"],
        emergency_contact=EmergencyContact(
            name="Bob",
            relation="Father",
            phone="+61, BOB 111 222",
        ),
    ),
)

Use when:

  • Nested dataclasses are common in your models.
  • You want to be selective. The field loop costs time, and running it for all the attributes of a class that contains no nested dataclasses is redundant, so you add the base class only where it is needed.
  • You are not sure that the model will stay this way, i.e. it may get a new nested dataclass later.

Approach #4 - be recursive

def check_attribute_tree(self):
    """Recursively replace dict values of dataclass-typed fields with instances.

    Mutates ``self`` in place and returns it, so the call can be nested inside
    the ``setattr`` that stores a freshly built child.
    """
    for spec in fields(self):
        nested_cls = spec.type
        if is_dataclass(nested_cls):
            raw = getattr(self, spec.name)
            if isinstance(raw, dict):
                # Build the child first, then walk its own fields before storing it.
                child = check_attribute_tree(nested_cls(**raw))
                setattr(self, spec.name, child)
    return self


@dataclass
class RootParseSubDataclasses:
    """Base class for the root model: runs ``check_attribute_tree`` after ``__init__``."""

    def __post_init__(self):
        # The helper's return value (the instance itself) is not needed here.
        check_attribute_tree(self)


@dataclass
class Person(RootParseSubDataclasses):
    """Root model; nested dict values are converted by the inherited ``__post_init__``."""

    name: str
    age: int
    email: str
    address: Address
    contacts: Contacts

Approach #5 - be pydantic

Pydantic also validates at runtime, whereas dataclass annotations are only checked statically, e.g. by mypy. But here we can go over each field and really check its type, not only convert nested dataclasses:

import types
from dataclasses import dataclass, fields, is_dataclass
from typing import Any, Union, get_args, get_origin


def _union_members(annotation):
    """Return the alternatives of ``Optional``/``Union``/``X | Y``, else ``(annotation,)``."""
    origin = get_origin(annotation)
    # ``types.UnionType`` (the ``X | Y`` form) only exists on Python 3.10+.
    if origin is Union or origin is getattr(types, "UnionType", Union):
        return get_args(annotation)
    return (annotation,)


def _accepts(value, annotation):
    """Return True if ``value`` satisfies ``annotation`` as far as ``isinstance`` can tell."""
    for option in _union_members(annotation):
        if option is Any:
            return True
        # ``list[str]`` -> ``list``: only the container is checked, not its items.
        target = get_origin(option) or option
        if not isinstance(target, type):
            # String forward references, Literal, TypeVar, ...: isinstance cannot
            # verify these, so do not reject the value because of them.
            return True
        if isinstance(value, target):
            return True
    return False


def make_me_pydantic(self):
    """Validate every field of a dataclass instance and build nested dataclasses.

    For each field, the current value is checked against the annotation:

    * a value that already matches (including any member of an
      ``Optional``/``Union``, or anything for ``Any``) is kept;
    * a ``dict`` whose annotation is, or contains, a dataclass is replaced by an
      instance of that dataclass, which is validated recursively;
    * anything else is rejected.

    Returns:
        ``self``, mutated in place.

    Raises:
        ValueError: if a value neither matches its annotation nor can be
            converted to the annotated dataclass.
    """
    for field in fields(self):
        value = getattr(self, field.name)
        if _accepts(value, field.type):
            continue
        if isinstance(value, dict):
            # Pick the dataclass out of e.g. ``Optional[Address]`` as well.
            nested = next(
                (option for option in _union_members(field.type) if is_dataclass(option)),
                None,
            )
            if nested is not None:
                setattr(self, field.name, make_me_pydantic(nested(**value)))
                continue
        raise ValueError(f"`{type(self).__name__}.{field.name} = {value!r}` should be of type {field.type=}, but is {type(value)}")
    return self


@dataclass
class RootParseSubDataclasses:
    """Base class for the root model: runs ``make_me_pydantic`` after ``__init__``."""

    def __post_init__(self):
        # The helper's return value (the instance itself) is not needed here.
        make_me_pydantic(self)


@dataclass
class Person(RootParseSubDataclasses):
    """Root model; the inherited ``__post_init__`` checks and converts its fields."""

    name: str
    age: int
    email: str
    address: Address
    contacts: Contacts

Let's change the Person.age in the data dict into a string, like:

# Same payload as before, except that "age" is now a string.
data = {
    "name": "Bob",
    "age": "29",  # a str, although Person.age is annotated as int
    "email": "bob@bob.com",
    "address": {
        "street": "42 Bob Avenue",
        "city": "Sydney",
        "state": "NSW",
        "postal_code": "2000",
        "country": "Australia",
    },
    "contacts": {
        "phone_numbers": ["+61 B0B 123 456", "+61 B0B 654 321"],
        "emergency_contact": {"name": "Bob", "relation": "Father", "phone": "+61, BOB 111 222"},
    },
}

And we get:
ValueError: `Person.age = '29'` should be of type field.type=<class 'int'>, but is <class 'str'>