Design molecules and materials by learning from property preferences
Instead of explicitly defining molecular properties, researchers can rate generated molecules based on multiple criteria, and PLGL learns the complex trade-offs automatically.
Create diverse molecular structures from the latent space
Rate them on activity, toxicity, and synthesizability
Navigate to molecules with the ideal property balance (a minimal end-to-end sketch of this loop follows)
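Putting the three steps together, a rough end-to-end sketch might look like the following. It relies on the MolecularVAE and MolecularPreferenceLearner classes defined below; the vocabulary size, number of ratings, and target property values are illustrative placeholders rather than values from the original workflow.

molecular_vae = MolecularVAE(vocab_size=60, latent_dim=256)  # assumed pretrained on SMILES
learner = MolecularPreferenceLearner(molecular_vae)

# 1. Generate candidates and collect chemist ratings (interactive)
learner.collect_preferences(n_samples=100)

# 2. Fit the preference model to the collected ratings
#    (train_preference_model is a sketch shown further below)
learner.train_preference_model()

# 3. Navigate the latent space toward preferred molecules
best_smiles = learner.optimize_molecule(
    target_properties={'molecular_weight': 350.0, 'logp': 2.5}
)
print(best_smiles)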
Encode molecular structures into a continuous latent space:
import torch
import torch.nn as nn
from rdkit import Chem
from rdkit.Chem import Descriptors

class MolecularVAE(nn.Module):
    """VAE for molecular generation using SMILES representation"""

    def __init__(self, vocab_size, latent_dim=256, max_length=120):
        super().__init__()
        self.latent_dim = latent_dim
        self.max_length = max_length  # maximum SMILES sequence length

        # Encoder: SMILES → Latent
        self.encoder = nn.LSTM(
            input_size=vocab_size,
            hidden_size=512,
            num_layers=3,
            batch_first=True
        )
        self.fc_mu = nn.Linear(512, latent_dim)
        self.fc_logvar = nn.Linear(512, latent_dim)

        # Decoder: Latent → SMILES
        self.decoder = nn.LSTM(
            input_size=latent_dim,
            hidden_size=512,
            num_layers=3,
            batch_first=True
        )
        self.output = nn.Linear(512, vocab_size)

    def encode(self, x):
        _, (h, _) = self.encoder(x)
        h = h[-1]  # Hidden state of the last LSTM layer
        return self.fc_mu(h), self.fc_logvar(h)

    def decode(self, z):
        # Expand z across the sequence dimension so the decoder sees it at every step
        z = z.unsqueeze(1).repeat(1, self.max_length, 1)
        output, _ = self.decoder(z)
        return self.output(output)  # Per-position logits over the SMILES vocabulary
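The listing above omits how the encoder's mean and log-variance are turned into a sample and how the VAE is trained. A generic sketch of the standard reparameterization trick and the reconstruction-plus-KL objective is shown below, assuming the SMILES input x is one-hot encoded; this is illustrative rather than the exact training setup used here.

def reparameterize(self, mu, logvar):
    """Sample z = mu + sigma * eps so gradients flow through the encoder."""
    std = torch.exp(0.5 * logvar)
    eps = torch.randn_like(std)
    return mu + eps * std

def vae_loss(self, x, logits, mu, logvar):
    """Reconstruction (cross-entropy over SMILES tokens) plus KL divergence."""
    recon = nn.functional.cross_entropy(
        logits.reshape(-1, logits.size(-1)),   # (batch * length, vocab)
        x.argmax(dim=-1).reshape(-1)           # token indices from one-hot input
    )
    kl = -0.5 * torch.mean(1 + logvar - mu.pow(2) - logvar.exp())
    return recon + kl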
Learn complex trade-offs between molecular properties:
class MolecularPreferenceLearner:
    def __init__(self, molecular_vae):
        self.vae = molecular_vae
        self.samples = []

        # Preference model: maps a latent code to a desirability score in [0, 1]
        self.preference_model = nn.Sequential(
            nn.Linear(molecular_vae.latent_dim, 512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )

    def compute_molecular_properties(self, smiles):
        """Compute key molecular properties"""
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None
        properties = {
            'molecular_weight': Descriptors.MolWt(mol),
            'logp': Descriptors.MolLogP(mol),   # Lipophilicity
            'qed': Descriptors.qed(mol),        # Drug-likeness
            'sa_score': self.synthetic_accessibility(mol),
            'num_rings': Descriptors.RingCount(mol),
            'num_hbd': Descriptors.NumHDonors(mol),
            'num_hba': Descriptors.NumHAcceptors(mol)
        }
        return properties

    def collect_preferences(self, n_samples=100):
        """Collect preferences with property visualization"""
        for i in range(n_samples):
            # Generate molecule
            z = torch.randn(1, self.vae.latent_dim)
            smiles = self.vae.decode_to_smiles(z)

            # Compute properties; skip invalid SMILES
            props = self.compute_molecular_properties(smiles)
            if props is None:
                continue

            # Display to chemist for rating
            print(f"\nMolecule {i+1}:")
            print(f"SMILES: {smiles}")
            print(f"MW: {props['molecular_weight']:.1f}")
            print(f"LogP: {props['logp']:.2f}")
            print(f"QED: {props['qed']:.2f}")
            print(f"Synthetic accessibility: {props['sa_score']:.2f}")

            # Get rating (0-1) based on overall desirability
            rating = get_chemist_rating()

            self.samples.append({
                'latent': z,
                'smiles': smiles,
                'properties': props,
                'rating': rating
            })
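The listing does not show how the preference model is fit to the collected ratings. One minimal way to do it, assuming ratings in [0, 1] and a plain mean-squared-error regression on the stored latent codes, is sketched below; the method name and hyperparameters are illustrative, not part of the original API.

def train_preference_model(self, epochs=200, lr=1e-3):
    """Sketch: fit the preference model to the collected chemist ratings."""
    latents = torch.cat([s['latent'] for s in self.samples], dim=0)
    ratings = torch.tensor([[s['rating']] for s in self.samples],
                           dtype=torch.float32)

    optimizer = torch.optim.Adam(self.preference_model.parameters(), lr=lr)
    loss_fn = nn.MSELoss()

    for epoch in range(epochs):
        optimizer.zero_grad()
        pred = self.preference_model(latents)   # desirability scores in [0, 1]
        loss = loss_fn(pred, ratings)
        loss.backward()
        optimizer.step()
    return loss.item()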
Navigate the latent space while balancing multiple objectives:
def optimize_molecule(self, target_properties=None, n_steps=1000):
    """Find molecules with desired property profile"""
    # Start from a promising region if we already have rated samples
    if self.samples:
        # Use the best-rated sample as the starting point
        best_idx = max(range(len(self.samples)),
                       key=lambda i: self.samples[i]['rating'])
        z = self.samples[best_idx]['latent'].clone()
    else:
        z = torch.randn(1, self.vae.latent_dim)
    z.requires_grad = True

    optimizer = torch.optim.Adam([z], lr=0.01)

    for step in range(n_steps):
        # Generate molecule
        smiles = self.vae.decode_to_smiles(z)
        props = self.compute_molecular_properties(smiles)
        if props is None:
            continue  # Skip invalid molecules

        # Score with preference model
        score = self.preference_model(z)

        # Add property constraints if specified
        if target_properties:
            property_loss = 0.0
            for prop, target in target_properties.items():
                if prop in props:
                    # Squared error from target; the RDKit properties are plain
                    # floats, so this term acts as a fixed penalty and gradients
                    # flow only through the preference score
                    property_loss += (props[prop] - target) ** 2
            # Combined objective
            loss = -score + 0.1 * property_loss
        else:
            loss = -score

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Constrain to the valid region of latent space
        with torch.no_grad():
            z.clamp_(-3, 3)

    return self.vae.decode_to_smiles(z.detach())
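As a usage illustration, a hypothetical call biasing the search toward a lead-like profile could look like this; learner is an already-fitted MolecularPreferenceLearner, and the specific target values are illustrative.

# Illustrative usage: bias the search toward a lead-like property profile
candidate = learner.optimize_molecule(
    target_properties={
        'molecular_weight': 350.0,   # Da
        'logp': 2.5,
        'qed': 0.8
    },
    n_steps=500
)
print("Proposed molecule:", candidate)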
Generate variations while preserving core structure:
def generate_analogs(self, scaffold_smiles, n_analogs=20):
    """Generate molecules with the same scaffold but different properties"""
    # Encode scaffold into the latent space
    scaffold_z = self.vae.encode_smiles(scaffold_smiles)

    analogs = []
    for _ in range(n_analogs):
        # Add controlled noise around the scaffold's latent code
        noise = torch.randn_like(scaffold_z) * 0.1
        z_variant = scaffold_z + noise

        # Decode to molecule
        smiles = self.vae.decode_to_smiles(z_variant)

        # Keep only variants that preserve the scaffold
        if self.contains_scaffold(smiles, scaffold_smiles):
            score = self.preference_model(z_variant).item()
            props = self.compute_molecular_properties(smiles)
            analogs.append({
                'smiles': smiles,
                'score': score,
                'properties': props
            })

    # Sort by preference score, best first
    analogs.sort(key=lambda x: x['score'], reverse=True)
    return analogs
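contains_scaffold is referenced above but not defined. A simple way to implement it is an RDKit substructure match against the scaffold; this is a minimal sketch, and scaffold-constrained workflows in practice often rely on Murcko scaffolds or more specialized matching.

def contains_scaffold(self, smiles, scaffold_smiles):
    """Sketch: check that the generated molecule contains the scaffold as a substructure."""
    mol = Chem.MolFromSmiles(smiles)
    scaffold = Chem.MolFromSmiles(scaffold_smiles)
    if mol is None or scaffold is None:
        return False
    return mol.HasSubstructMatch(scaffold)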
class MaterialPreferenceLearner:
    """PLGL for material property optimization"""

    def __init__(self, crystal_vae):
        self.vae = crystal_vae  # VAE for crystal structures
        self.preference_model = self.build_preference_model()

    def optimize_for_properties(self, preferences):
        """
        Find materials matching property preferences:
        - Mechanical: strength, ductility, hardness
        - Electrical: conductivity, band gap
        - Thermal: melting point, thermal expansion
        - Chemical: stability, reactivity
        """
        # Start from a random crystal structure
        z = torch.randn(1, self.vae.latent_dim)

        for step in range(1000):
            # Generate crystal structure
            structure = self.vae.decode(z)

            # Predict properties (using an ML surrogate or DFT)
            properties = self.predict_properties(structure)

            # Score based on learned preferences
            score = self.preference_model(z)

            # Update latent code
            z = self.gradient_step(z, score)

        return structure
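Usage might look like the sketch below; crystal_vae, the helper methods (build_preference_model, predict_properties, gradient_step), and the preference dictionary are placeholders for whatever structure generator and rating workflow you use, with the preference model assumed to have been trained from ratings as in the molecular example.

# Illustrative usage with a hypothetical crystal-structure VAE
material_learner = MaterialPreferenceLearner(crystal_vae)
best_structure = material_learner.optimize_for_properties(
    preferences={'strength': 'high', 'ductility': 'moderate', 'cost': 'low'}
)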
Design drugs with optimal balance of efficacy, safety, and manufacturability.
Discover materials with ideal energy density and stability trade-offs.
Engineer proteins with desired function while maintaining stability.
Design materials balancing durability, cost, and environmental impact.
Ready to apply PLGL to scientific discovery?