Tutorial: Chemical Properties Prediction with Graph Neural Networks


In this tutorial we will use Graph Neural Networks (GNNs) for chemical properties prediction. Chemical properties prediction involves estimating or determining the physical, chemical, or biological characteristics of molecules based on their structure.

Molecules can naturally be represented as graphs, where atoms serve as the nodes and chemical bonds as the edges connecting them. This graph-based structure makes GNNs a great fit for predicting chemical properties.

In this tutorial we will use the QM9 dataset from PyTorch Geometric. The dataset contains small molecules, each consisting of up to 29 atoms, with every atom having a corresponding 3D position. Each atom is also described by a feature vector whose first five entries are a one-hot encoding of the atom type (H, C, N, O, F).
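
To make the graph representation concrete, here is a minimal, illustrative sketch (not taken from QM9; the coordinates and bond list below are made up) of how a single molecule, water, could be encoded as a PyTorch Geometric Data object:

from torch_geometric.data import Data
import torch

# node features: one-hot atom type over (H, C, N, O, F), one row per atom
x = torch.tensor(
    [
        [0.0, 0.0, 0.0, 1.0, 0.0],  # O
        [1.0, 0.0, 0.0, 0.0, 0.0],  # H
        [1.0, 0.0, 0.0, 0.0, 0.0],  # H
    ]
)
# edges: the two O-H bonds, stored in both directions (undirected graph)
edge_index = torch.tensor([[0, 1, 0, 2], [1, 0, 2, 0]])
# 3D positions of each atom (illustrative coordinates)
pos = torch.tensor([[0.0, 0.0, 0.0], [0.96, 0.0, 0.0], [-0.24, 0.93, 0.0]])

water = Data(x=x, edge_index=edge_index, pos=pos)
print(water)  # e.g. Data(x=[3, 5], edge_index=[2, 4], pos=[3, 3])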

First of all, let's start by importing useful modules!

In [1]:
## routine needed to run the notebook on Google Colab
try:
    import google.colab

    IN_COLAB = True
except ImportError:
    IN_COLAB = False
if IN_COLAB:
    !pip install "pina-mathlab[tutorial]"

import torch
import warnings

from pina import Trainer
from pina.solver import SupervisedSolver
from pina.problem.zoo import SupervisedProblem

from torch_geometric.datasets import QM9
from torch_geometric.nn import GCNConv, global_mean_pool

warnings.filterwarnings("ignore")

Download Data and Create the Problem

We download the dataset and save the molecules as a list of Data objects (input_), where each element contains one molecule encoded in a graph structure. The corresponding target properties (target_) are listed below:

| Target | Property | Description | Unit |
|---|---|---|---|
| 0 | $\mu$ | Dipole moment | $D$ |
| 1 | $\alpha$ | Isotropic polarizability | $a_0^3$ |
| 2 | $\epsilon_{\textrm{HOMO}}$ | Highest occupied molecular orbital energy | $eV$ |
| 3 | $\epsilon_{\textrm{LUMO}}$ | Lowest unoccupied molecular orbital energy | $eV$ |
| 4 | $\Delta \epsilon$ | Gap between $\epsilon_{\textrm{HOMO}}$ and $\epsilon_{\textrm{LUMO}}$ | $eV$ |
| 5 | $\langle R^2 \rangle$ | Electronic spatial extent | $a_0^2$ |
| 6 | $\textrm{ZPVE}$ | Zero point vibrational energy | $eV$ |
| 7 | $U_0$ | Internal energy at 0 K | $eV$ |
| 8 | $U$ | Internal energy at 298.15 K | $eV$ |
| 9 | $H$ | Enthalpy at 298.15 K | $eV$ |
| 10 | $G$ | Free energy at 298.15 K | $eV$ |
| 11 | $c_{\textrm{v}}$ | Heat capacity at 298.15 K | $\mathrm{cal/(mol \cdot K)}$ |
| 12 | $U_0^{\textrm{ATOM}}$ | Atomization energy at 0 K | $eV$ |
| 13 | $U^{\textrm{ATOM}}$ | Atomization energy at 298.15 K | $eV$ |
| 14 | $H^{\textrm{ATOM}}$ | Atomization enthalpy at 298.15 K | $eV$ |
| 15 | $G^{\textrm{ATOM}}$ | Atomization free energy at 298.15 K | $eV$ |
| 16 | $A$ | Rotational constant | $GHz$ |
| 17 | $B$ | Rotational constant | $GHz$ |
| 18 | $C$ | Rotational constant | $GHz$ |
In [2]:
# download the data + shuffling
dataset = QM9(root="./tutorial_logs").shuffle()

# save the dataset
input_ = [data for data in dataset]
target_ = torch.cat([data.y for data in dataset])

# normalize the target
mean = target_.mean(dim=0, keepdim=True)
std = target_.std(dim=0, keepdim=True)
target_ = (target_ - mean) / std
Downloading https://data.pyg.org/datasets/qm9_v3.zip
Extracting tutorial_logs/raw/qm9_v3.zip
Processing...
Using a pre-processed version of the dataset. Please install 'rdkit' to alternatively process the raw data.
Done!
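
Before building the problem, it can be useful to peek at a single sample; the short, optional check below assumes the cell above has already been run. Note that in the PyG version of QM9 each atom carries an 11-dimensional feature vector, in which the first five entries are the one-hot atom type.

# inspect one molecule and the target tensor
sample = input_[0]
print(sample)            # graph summary: node features, edges, 3D positions, ...
print(sample.x.shape)    # node feature matrix: [num_atoms, 11]
print(sample.pos.shape)  # atom positions: [num_atoms, 3]
print(target_.shape)     # normalized targets: [num_molecules, 19]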

Great! Once the data are downloaded, building the problem is straightforward by using the SupervisedProblem class.

In [3]:
# build the problem
problem = SupervisedProblem(input_=input_, output_=target_)

Build the Model

To predict molecular properties, we will construct a simple convolutional Graph Neural Network using the GCNConv module from PyG. While this tutorial focuses on a straightforward model, more advanced architectures, such as Equivariant Networks, could potentially yield better performance. Please note that this tutorial is intended for demonstration purposes only.

Importantly, notice that in the forward pass we receive a Data object as input and unpack the graph attributes inside the method. This is the only requirement in PINA for using graphs and solvers together.

In [4]:
class GNN(torch.nn.Module):
    def __init__(self, in_features, out_features, hidden_dim=256):
        super(GNN, self).__init__()
        self.conv1 = GCNConv(in_features, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.fc = torch.nn.Linear(hidden_dim, out_features)

    def forward(self, data):
        # extract graph attributes; N.B. in PINA, Data objects are passed as input
        x, edge_index, batch = data.x, data.edge_index, data.batch
        # perform normal graph operations
        x = torch.relu(self.conv1(x, edge_index))
        x = torch.relu(self.conv2(x, edge_index))
        x = global_mean_pool(x, batch)
        return self.fc(x)
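
As an optional sanity check (a sketch assuming the dataset from the previous cells is still in memory), we can batch a few molecules with PyG's Batch and verify that the untrained model returns one 19-dimensional prediction per molecule:

from torch_geometric.data import Batch

# assemble a mini-batch of 4 molecules and run the (untrained) model on it
mini_batch = Batch.from_data_list(input_[:4])
model = GNN(in_features=11, out_features=19)
print(model(mini_batch).shape)  # torch.Size([4, 19])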

Train the Model

Now that the problem is created and the model is built, we can train the model using the SupervisedSolver, which is the solver for standard supervised learning tasks. We will optimize the Mean Absolute Error (L1 loss) and test on the same metric. In the Trainer class we specify the training hyperparameters.

In [5]:
# define the solver
solver = SupervisedSolver(
    problem=problem,
    model=GNN(in_features=11, out_features=19),
    use_lt=False,
    loss=torch.nn.L1Loss(),
)
trainer = Trainer(
    solver,
    max_epochs=3,
    train_size=0.7,
    test_size=0.2,
    val_size=0.1,
    batch_size=512,
    accelerator="cpu",
    enable_model_summary=False,
)
trainer.train()
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
`Trainer.fit` stopped: `max_epochs=3` reached.

Testing Chemical Predictions

In [6]:
_ = trainer.test()
────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────
     test_loss_epoch        0.4035990834236145
────────────────────────────────────────────────────────────

We observe that the model achieves an average error of approximately 0.4 MAE (on the normalized targets) across all property predictions. This is an aggregate figure, but we can also inspect the error for each individual property prediction.

To do this, we need access to the test dataset, which can be retrieved from the trainer's datamodule. Each datamodule contains both the dataloader and dataset objects. For the dataset, we can use the get_all_data() method. This function returns the entire dataset as a dictionary, where the keys represent the Condition names, and the values are dictionaries containing input and target tensors.

In [7]:
# get the test dataset
test_dataset = trainer.datamodule.test_dataset.get_all_data()
print("Here the dataset")
print(f"Dataset keys: {test_dataset.keys()}")
print(f"Dataset keys for data condition: {test_dataset['data'].keys()}")
print(
    f"Dataset values type for data condition: {[v.__class__.__name__ for v in test_dataset['data'].values()]}"
)

# extract input and target for test dataset
input_test = test_dataset["data"]["input"]
target_test = test_dataset["data"]["target"]
Here the dataset
Dataset keys: dict_keys(['data'])
Dataset keys for data condition: dict_keys(['input', 'target'])
Dataset values type for data condition: ['DataLabelBatch', 'Tensor']

Now we obtain the predictions by calling the forward pass of the SupervisedSolver.

In [8]:
# get the prediction
prediction_test = solver(input_test)
print(f"Number of prediction properties: {prediction_test.shape[-1]}")
Number of prediction properties: 19

As you can see, we obtain a tensor with 19 predicted properties as output, which is exactly what we are looking for. Now let's compute the error for each property:

In [9]:
properties = [
    "μ",
    "α",
    "ε HOMO",
    "ε LUMO",
    "Δε",
    "⟨R²⟩",
    "ZPVE",
    "U₀",
    "U",
    "H",
    "G",
    "cv",
    "U₀ ATOM",
    "U ATOM",
    "H ATOM",
    "G ATOM",
    "A",
    "B",
    "C",
]

units = [
    "D",
    "a₀³",
    "eV",
    "eV",
    "eV",
    "a₀²",
    "eV",
    "eV",
    "eV",
    "eV",
    "eV",
    "cal/(mol·K)",
    "eV",
    "eV",
    "eV",
    "eV",
    "GHz",
    "GHz",
    "GHz",
]

print(f"{'Property':<10} | {'Error':<8} | {'Unit'}")
print("-" * 34)

for idx in range(19):
    error = torch.abs(prediction_test[:, idx] - target_test[:, idx]).mean()
    print(f"{properties[idx]:<10} | {error:.4f}   | {units[idx]}")
Property   | Error    | Unit
----------------------------------
μ          | 0.6903   | D
α          | 0.4514   | a₀³
ε HOMO     | 0.6874   | eV
ε LUMO     | 0.6221   | eV
Δε         | 0.6680   | eV
⟨R²⟩       | 0.6613   | a₀²
ZPVE       | 0.2487   | eV
U₀         | 0.3803   | eV
U          | 0.3811   | eV
H          | 0.3786   | eV
G          | 0.3813   | eV
cv         | 0.5490   | cal/(mol·K)
U₀ ATOM    | 0.2847   | eV
U ATOM     | 0.2835   | eV
H ATOM     | 0.2831   | eV
G ATOM     | 0.2879   | eV
A          | 0.0018   | GHz
B          | 0.2134   | GHz
C          | 0.2142   | GHz

We can see that some properties are easier to predict than others. For example, the rotational constant $A$ is much easier to predict than the dipole moment $\mu$. To get a better picture, we can also draw a scatter plot of predicted versus observed values for each property:

In [10]:
import matplotlib.pyplot as plt

# Set up the plot grid
num_properties = 19
fig, axes = plt.subplots(4, 5, figsize=(10, 8))
axes = axes.flatten()

# Outlier removal using IQR (with torch)
for idx in range(num_properties):
    target_vals = target_test[:, idx]
    pred_vals = prediction_test[:, idx]

    # Calculate Q1 (25th percentile) and Q3 (75th percentile) using torch
    Q1 = torch.quantile(target_vals, 0.25)
    Q3 = torch.quantile(target_vals, 0.75)
    IQR = Q3 - Q1

    # Define the outlier range
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Filter out the outliers
    mask = (target_vals >= lower_bound) & (target_vals <= upper_bound)
    filtered_target = target_vals[mask]
    filtered_pred = pred_vals[mask]

    # Plotting
    ax = axes[idx]
    ax.scatter(
        filtered_target.detach(),
        filtered_pred.detach(),
        alpha=0.5,
        label="Data points (no outliers)",
    )
    ax.plot(
        [filtered_target.min().item(), filtered_target.max().item()],
        [filtered_target.min().item(), filtered_target.max().item()],
        "r--",
        label="y=x",
    )

    ax.set_title(properties[idx])
    ax.set_xlabel("Target")
    ax.set_ylabel("Prediction")

# Remove the extra subplot (since there are 19 properties, not 20)
if num_properties < len(axes):
    fig.delaxes(axes[-1])

plt.tight_layout()
plt.show()
(Figure: a 4×5 grid of scatter plots, one per property, comparing predicted and target values against the y=x reference line.)

Looking more closely, we can see that $A$ is actually not predicted that well; the small values of this quantity simply lead to a lower MAE than for the other properties. From the plot we can also see that the atomization energies, free energy, and enthalpy are the predicted properties with the highest correlation to the true chemical properties.
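
Also keep in mind that the targets were normalized before training, so the errors reported above are expressed in units of the per-property standard deviation rather than directly in the physical units listed. As a rough sketch, they can be rescaled back to physical units using the std tensor computed earlier:

# rescale the normalized per-property errors back to physical units
for idx in range(19):
    error_norm = torch.abs(prediction_test[:, idx] - target_test[:, idx]).mean()
    error_phys = error_norm * std[0, idx]
    print(f"{properties[idx]:<10} | {error_phys.item():.4f} {units[idx]}")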

What's Next?

Congratulations on completing the tutorial on chemical properties prediction with PINA! Now that you've got the basics, there are several exciting directions to explore:

  1. Train the network for longer or with different layer sizes: Experiment with various configurations to see how the network's accuracy improves (a sketch is given after this list).

  2. Use a different network: For example, Equivariant Graph Neural Networks (EGNNs) have shown great results on molecular tasks by leveraging group symmetries. If you're interested, check out E(n) Equivariant Graph Neural Networks for more details.

  3. What if the input is time-dependent?: For example, predicting force fields in Molecular Dynamics simulations. In PINA, you can predict force fields with ease, as it's still a supervised learning task. If this interests you, have a look at Machine Learning Force Fields.

  4. ...and many more!: The possibilities are vast, including exploring new architectures, working with larger datasets, and applying this framework to more complex systems.
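
For example, for the first point above, a longer run with a wider hidden layer could look like the sketch below (settings are illustrative, not tuned):

# illustrative re-run: more epochs and a larger hidden dimension
solver = SupervisedSolver(
    problem=problem,
    model=GNN(in_features=11, out_features=19, hidden_dim=512),
    use_lt=False,
    loss=torch.nn.L1Loss(),
)
trainer = Trainer(
    solver,
    max_epochs=50,
    train_size=0.7,
    test_size=0.2,
    val_size=0.1,
    batch_size=512,
    accelerator="cpu",
    enable_model_summary=False,
)
trainer.train()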

For more resources and tutorials, check out the PINA Documentation.