pytorch_geometric
Issue with GNN using own heterogeneous dataset
I have generated a heterogeneous dataset from some CSV files. areaDZ nodes have features, while areaNoFeature nodes do not:
HeteroData(
areaNoFeature={ num_nodes=1316 },
areaDZ={
x=[5841, 23],
y=[5841],
train_mask=[5841],
val_mask=[5841],
test_mask=[5841]
},
(areaDZ, parent, areaNoFeature)={ edge_index=[2, 5841] },
(areaNoFeature, parent, areaNoFeature)={ edge_index=[2, 2630] },
(areaNoFeature, rev_parent, areaDZ)={ edge_index=[2, 5841] }
)
The masks are created using the RandomNodeSplit transform:
transform = RandomNodeSplit(split='train_rest', num_val=100, num_test=0.25)
data = transform(data)
I try to train a GNN using some code provided on GitHub, but I get an error:
train_input_nodes = ('areaDZ', data['areaDZ'].train_mask)
val_input_nodes = ('areaDZ', data['areaDZ'].val_mask)
kwargs = {'batch_size': 32, 'num_workers': 6, 'persistent_workers': True}
train_loader = NeighborLoader(data, num_neighbors=[10] * 2, shuffle=True, input_nodes=train_input_nodes, **kwargs)
val_loader = NeighborLoader(data, num_neighbors=[10] * 2, input_nodes=val_input_nodes, **kwargs)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Sequential('x, edge_index', [
    (SAGEConv((-1, -1), 64), 'x, edge_index -> x'),
    ReLU(inplace=True),
    (SAGEConv((-1, -1), 64), 'x, edge_index -> x'),
    ReLU(inplace=True),
    (Linear(-1, 2), 'x -> x'),
])
model = to_hetero(model, data.metadata(), aggr='sum').to(device)

@torch.no_grad()
def init_params():
    # Initialize lazy parameters via forwarding a single batch to the model:
    batch = next(iter(train_loader))
    batch = batch.to(device)
    model(batch.x_dict, batch.edge_index_dict)

def train():
    model.train()
    total_examples = total_loss = 0
    for batch in tqdm(train_loader):
        optimizer.zero_grad()
        batch = batch.to(device)
        batch_size = batch['areaDZ'].batch_size
        out = model(batch.x_dict, batch.edge_index_dict)['areaDZ'][:batch_size]
        loss = F.cross_entropy(out, batch['areaDZ'].y[:batch_size])
        loss.backward()
        optimizer.step()
        total_examples += batch_size
        total_loss += float(loss) * batch_size
    return total_loss / total_examples

@torch.no_grad()
def test(loader):
    model.eval()
    total_examples = total_correct = 0
    for batch in tqdm(loader):
        batch = batch.to(device)
        batch_size = batch['areaDZ'].batch_size
        out = model(batch.x_dict, batch.edge_index_dict)['areaDZ'][:batch_size]
        pred = out.argmax(dim=-1)
        total_examples += batch_size
        total_correct += int((pred == batch['areaDZ'].y[:batch_size]).sum())
    return total_correct / total_examples

init_params()  # Initialize parameters.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

for epoch in range(1, 21):
    loss = train()
    val_acc = test(val_loader)
    print(f'Epoch: {epoch:02d}, Loss: {loss:.4f}, Val: {val_acc:.4f}')
The error:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
/tmp/ipykernel_2445/3976418620.py in <module>
----> 1 init_params() # Initialize parameters.
2 optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
3
4 for epoch in range(1, 21):
5 loss = train()
/opt/conda/lib/python3.7/site-packages/torch/autograd/grad_mode.py in decorate_context(*args, **kwargs)
26 def decorate_context(*args, **kwargs):
27 with self.__class__():
---> 28 return func(*args, **kwargs)
29 return cast(F, decorate_context)
30
/tmp/ipykernel_2445/1078389677.py in init_params()
29 batch = next(iter(train_loader))
30 batch = batch.to(device)
---> 31 model(batch.x_dict, batch.edge_index_dict)
32
33
/opt/conda/lib/python3.7/site-packages/torch/fx/graph_module.py in wrapped_call(self, *args, **kwargs)
511 print(generate_error_message(topmost_framesummary),
512 file=sys.stderr)
--> 513 raise e.with_traceback(None)
514
515 cls.__call__ = wrapped_call
AttributeError: 'NoneType' object has no attribute 'dim'
Any help will be much appreciated!
All node types will need to have some features to enable message passing. In case they are not given, you can use torch.nn.Embedding to learn them, e.g.:
class MyModel(torch.nn.Module):
    def __init__(self, ...):
        super().__init__()  # required before registering submodules
        self.emb = Embedding(1316, 64)
        self.model = to_hetero(model)

    def forward(self, x_dict, edge_index_dict):
        x_dict = copy.copy(x_dict)
        x_dict["areaNoFeature"] = self.emb.weight
        return self.model(x_dict, edge_index_dict)
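One caveat worth noting (an addition on my part, not from the original answer): self.emb.weight returns rows for all 1316 areaNoFeature nodes, which matches a full-batch forward pass. When sampling subgraphs with NeighborLoader, each batch only holds a subset of nodes, so the embedding rows have to be selected by the batch's global node indices. In recent PyG versions these are exposed per node type as n_id; a sketch under that assumption:

def forward(self, x_dict, edge_index_dict, n_id_dict):
    x_dict = copy.copy(x_dict)
    # Look up embeddings only for the 'areaNoFeature' nodes present in
    # this mini-batch; n_id_dict holds their global node indices:
    x_dict["areaNoFeature"] = self.emb(n_id_dict["areaNoFeature"])
    return self.model(x_dict, edge_index_dict)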
@rusty1s apologies for resurrecting an old post.
I am attempting to use Embedding layers for some of my node attributes in my heterogeneous graph. It seems to run but I don't think the weights of the embedding layers are being updated at all.
This is my custom heterogeneous model:
class GCN(nn.Module):
    def __init__(self, data, in_channels: Union[int, Dict[str, int]],
                 out_channels: int, hidden_channels=hidden_channels, heads=8):
        super().__init__()
        self.conv1 = HANConv(in_channels, hidden_channels, heads=heads, metadata=data.metadata())
        self.lin1 = nn.Linear(hidden_channels, hidden_channels)
        self.lin2 = nn.Linear(hidden_channels, out_channels)
        # learnable embeddings for all extra nodes:
        self.embeddings = {
            nt: torch.nn.Embedding(len(data.x_dict[nt]), attribute_embedding_length)
            for nt in extra_node_types
        }
        for nt in extra_node_types:
            self.embeddings[nt].requires_grad = True  # force trainable
            self.embeddings[nt].to(device)

    def forward(self, x_dict, edge_index_dict):
        # use embeddings for selected nodes
        # TODO: this does not train them!
        x_dict = x_dict.copy()
        for nt in extra_node_types:
            x_dict[nt] = self.embeddings[nt].weight
        print(x_dict[extra_node_types[0]][0])
        out = self.conv1(x_dict, edge_index_dict)
        self.hidden_state = self.lin1(out[node_type])
        out = F.dropout(self.hidden_state, p=0.5, training=self.training)
        return self.lin2(out)
extra_node_types is a list of the node types I want embeddings for.
When printing out one of the embeddings for a node, I always get the same vector, no matter how many epochs I train the model. Any help would be truly appreciated!
You will need to use torch.nn.ModuleDict in order to let PyTorch optimize your embedding:
self.embeddings = torch.nn.ModuleDict({
    nt: torch.nn.Embedding(len(data.x_dict[nt]), attribute_embedding_length)
    for nt in extra_node_types
})
As a result, the calls to requires_grad and to(...) can be dropped.
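A quick sanity check (my own suggestion, not from the thread): once the embeddings live inside a ModuleDict, they should appear among the model's named parameters, which is exactly what the optimizer iterates over:

# Every embedding table should be listed here with requires_grad=True;
# if one is missing, the optimizer will never see it:
for name, param in model.named_parameters():
    if 'embeddings' in name:
        print(name, tuple(param.shape), param.requires_grad)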
Thanks for that. I made the changes, but the embeddings are still not changing. I noticed your suggestion of copying the embedding weights into the x_dict, but I don't see them ever coming back to the embedding layer. Maybe that's what I'm missing?
class GCN(nn.Module):
    def __init__(self, data, in_channels: Union[int, Dict[str, int]],
                 out_channels: int, hidden_channels=hidden_channels, heads=8):
        super().__init__()
        self.conv1 = HANConv(in_channels, hidden_channels, heads=heads, metadata=data.metadata())
        self.lin1 = nn.Linear(hidden_channels, hidden_channels)
        self.lin2 = nn.Linear(hidden_channels, out_channels)
        # learnable embeddings for all extra nodes:
        self.embeddings = torch.nn.ModuleDict({
            nt: torch.nn.Embedding(len(data.x_dict[nt]), attribute_embedding_length)
            for nt in extra_node_types
        })

    def forward(self, x_dict, edge_index_dict):
        # use embeddings for selected nodes
        # TODO: this does not train them!
        x_dict = x_dict.copy()
        for nt in extra_node_types:
            x_dict[nt] = self.embeddings[nt].weight
        print(sum(x_dict[extra_node_types[0]]))
        out = self.conv1(x_dict, edge_index_dict)
        self.hidden_state = self.lin1(out[node_type])
        out = F.dropout(self.hidden_state, p=0.5, training=self.training)
        return self.lin2(out)
This looks correct to me, and I'm pretty sure the embeddings should change. The only reason I can think of for them not changing is that there exists no edge type pointing to them in data.metadata().
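If the weights still look frozen, one way to narrow it down (a debugging sketch assuming the training loop above) is to check whether any gradient reaches the embeddings after the backward pass:

loss.backward()
for nt in extra_node_types:
    grad = model.embeddings[nt].weight.grad
    # grad is None (or all zeros) when no edge type carries messages
    # from this node type toward the nodes that produce the loss:
    print(nt, None if grad is None else grad.abs().sum().item())
optimizer.step()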
Got it, not really sure what I've changed but I indeed see the embeddings training now! Thanks as usual :)
Hi @rusty1s,
I am getting an error along the lines of what @zeginis was getting, but in my case I have features for all the node types.
Here is what my data object looks like (I'm using all the data for training right now, just to start with a simple example):
HeteroData(
A={
x=[85728, 12],
nodeids=[85728]
},
B={
x=[11437, 12],
nodeids=[11437]
},
C={
x=[114350, 12],
nodeids=[114350]
},
(A, A_to_A_edge, A)={
edge_index=[2, 87914],
edge_label=[87914],
edge_label_index=[2, 87914]
},
(B, B_to_A_edge, A)={
edge_index=[2, 54857],
edge_label=[54857],
edge_label_index=[2, 54857]
},
(C, C_to_A_edge, A)={
edge_index=[2, 205],
edge_label=[205],
edge_label_index=[2, 205]
},
(A, A_to_C_edge, C)={
edge_index=[2, 4222],
edge_label=[4222],
edge_label_index=[2, 4222]
},
(B, B_to_C_edge, C)={
edge_index=[2, 120292],
edge_label=[120292],
edge_label_index=[2, 120292]
},
(C, C_to_C_edge, C)={
edge_index=[2, 191660],
edge_label=[191660],
edge_label_index=[2, 191660]
}
)
I created the edge_label and edge_label_index attributes for each edge type myself, and there is no negative sampling -- I just wanted a very simple working example before I start adding more things on top of my data/model. Thus, edge_label is all ones for all edge types and edge_label_index equals edge_index for all the edge types.
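For reference, the all-positive labels described above could be created like this (a minimal sketch of the setup, assuming the HeteroData object is named data):

for edge_type in data.edge_types:
    num_edges = data[edge_type].edge_index.size(1)
    # Every observed edge is a positive example, and we predict on the
    # observed edges themselves:
    data[edge_type].edge_label = torch.ones(num_edges)
    data[edge_type].edge_label_index = data[edge_type].edge_index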
Here is my model:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class GNNEncoder(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv((-1, -1), hidden_channels)  # lazy initialization
        self.conv2 = SAGEConv((-1, -1), out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x

class EdgeDecoder(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.lin1 = Linear(2 * hidden_channels, hidden_channels)
        self.lin2 = Linear(hidden_channels, 1)

    def forward(self, z_dict, edge_label_index, edge_type):  # extra param edge_type
        row, col = edge_label_index
        z = torch.cat([z_dict[edge_type[0]][row], z_dict[edge_type[2]][col]], dim=-1)  # how to extend to all edge types? I want link predictions for all node types
        z = self.lin1(z).relu()
        z = self.lin2(z)
        # The decoder returns one logit per edge since the last linear layer
        # has output dim = 1 -- this number could be passed through a sigmoid
        # to make it a probability:
        return z.view(-1)

class Model(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.encoder = GNNEncoder(hidden_channels, hidden_channels)
        self.encoder = to_hetero(self.encoder, data.metadata(), aggr='sum')
        self.decoder = EdgeDecoder(hidden_channels)

    def forward(self, x_dict, edge_index_dict, edge_label_index, edge_type):  # added edge_type to generalize the pipeline to all edge types
        z_dict = self.encoder(x_dict, edge_index_dict)
        return z_dict, self.decoder(z_dict, edge_label_index, edge_type)

def initialize_new_model():
    model = Model(hidden_channels=32).to(device)
    # Due to lazy initialization, we need to run one model step so the number
    # of parameters can be inferred:
    with torch.no_grad():
        model.encoder(train_data.x_dict, train_data.edge_index_dict)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    return model, optimizer

def train_one_epoch(model, optimizer, edge_type):
    model.train()
    optimizer.zero_grad()
    z_dict, pred = model(train_data.x_dict, train_data.edge_index_dict,
                         train_data[edge_type].edge_label_index, edge_type)
    target = train_data[edge_type].edge_label
    loss = F.binary_cross_entropy_with_logits(pred, target)
    loss.backward()
    optimizer.step()
    return z_dict, float(loss)

@torch.no_grad()
def test(data, model, edge_type):
    model.eval()
    _, pred = model(data.x_dict, data.edge_index_dict,
                    data[edge_type].edge_label_index, edge_type)  # model returns (z_dict, pred)
    # pred = pred.clamp(min=0, max=5)
    target = data[edge_type].edge_label
    loss = F.binary_cross_entropy_with_logits(pred, target)
    return float(loss)

def train(n_epochs, model, optimizer, edge_type):
    train_loss = list()
    val_loss = list()
    test_loss = list()
    for epoch in range(1, n_epochs):
        z_dict, loss = train_one_epoch(model, optimizer, edge_type)  # I don't need the z_dicts at each epoch, I will retrieve all of them at the end
        # train_bcewl = test(train_data, model, edge_type)
        # val_rmse = test(val_data, model, edge_type)
        # test_rmse = test(test_data, model, edge_type)
        train_loss.append(loss)
        # val_loss.append(val_rmse)
        # test_loss.append(test_rmse)
        # print(z_dict)
    return z_dict, train_loss, val_loss, test_loss

# create a dictionary of models and training/val/test metrics in order to make predictions later
models_per_edge_type_dict = dict()
for edge_type in edge_types:  # edge_types is a list of tuples like ('node', 'edge', 'node')
    model, optimizer = initialize_new_model()
    z_dict, train_loss, val_loss, test_loss = train(10, model, optimizer, edge_type)
    models_per_edge_type_dict[edge_type[1]] = {
        "model": model,
        "optimizer": optimizer,
        "z_dict": z_dict,  # node embeddings for each edge type the model has been trained on
        "losses": {
            "train": train_loss,
            "val": val_loss,
            "test": test_loss
        }
    }
I don't do testing just yet; I am trying to get the training part to work first.
Not sure if this is the best approach, but I am creating a Model instance for each edge type, since in the examples I've seen the EdgeDecoder was trained on a specific edge type; I then save the models in a dictionary. This is off-topic, but is there a better way to create an EdgeDecoder that considers all edge types at once?
Back to the original question, this is the error I am getting:
Traceback (most recent call last):
File "/Users/cbrumar/.local/share/virtualenvs/gnn-TG0lFQrB/lib/python3.9/site-packages/torch/fx/graph_module.py", line 622, in wrapped_call
return super(type(self), self).__call__(*args, **kwargs)
File "/Users/cbrumar/.local/share/virtualenvs/gnn-TG0lFQrB/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
return forward_call(*input, **kwargs)
File "<eval_with_key>.9", line 25, in forward
relu__B = None.relu()
AttributeError: 'NoneType' object has no attribute 'relu'
Call using an FX-traced Module, line 25 of the traced Module's generated forward function:
relu__A = conv1__A.relu(); conv1__A = None
relu__B = None.relu()
~~~~~~~~~~~~~~~~~~~~~~~~ <--- HERE
relu__C = conv1__C.relu(); conv1__C = None
conv2__A1 = self.conv2.A__A_to_A_edge__A((relu__A, relu__A), edge_index__A__A_to_A_edge__A); edge_index__A__A_to_A_edge__A = None
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
Input In [184], in <cell line: 96>()
94 models_per_edge_type_dict = dict()
96 for edge_type in edge_types: # edge_types is a list of tuples like ('node', 'edge', 'node')
---> 97 model, optimizer = initialize_new_model()
98 z_dict, train_loss, val_loss, test_loss = train(10, model, optimizer, edge_type)
99 models_per_edge_type_dict[edge_type[1]] = {
100 "model": model,
101 "optmizier": optimizer,
(...)
107 }
108 }
Input In [184], in initialize_new_model()
44 # Due to lazy initialization, we need to run one model step so the number
45 # of parameters can be inferred:
46 with torch.no_grad():
---> 47 model.encoder(train_data.x_dict, train_data.edge_index_dict)
49 optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
51 return model, optimizer
File ~/.local/share/virtualenvs/gnn-TG0lFQrB/lib/python3.9/site-packages/torch/fx/graph_module.py:630, in GraphModule.recompile.<locals>.wrapped_call(self, *args, **kwargs)
627 if "eval_with_key" in topmost_framesummary.filename:
628 print(generate_error_message(topmost_framesummary),
629 file=sys.stderr)
--> 630 raise e.with_traceback(None)
AttributeError: 'NoneType' object has no attribute 'relu'
Something that I've tried before was using RandomLinkSplit, which required me to create the reverse edges. I did that by using T.ToUndirected(reduce=False)(data). That would create reverse edges for all edges except for (A, A_to_A_edge, A) and (C, C_to_C_edge, C). This leads to an error when using RandomLinkSplit:
---------------------------------------------------------------------------
AssertionError Traceback (most recent call last)
Input In [200], in <cell line: 1>()
----> 1 train_data, val_data, test_data = T.RandomLinkSplit(
2 num_val=0.0,
3 num_test=0.0,
4 neg_sampling_ratio=0.0,
5 edge_types=edge_types_no_rev,
6 rev_edge_types=rev_edge_types
7 )(data)
File ~/.local/share/virtualenvs/gnn-TG0lFQrB/lib/python3.9/site-packages/torch_geometric/transforms/random_link_split.py:108, in RandomLinkSplit.__init__(self, num_val, num_test, is_undirected, key, split_labels, add_negative_train_samples, neg_sampling_ratio, disjoint_train_ratio, edge_types, rev_edge_types)
106 if isinstance(edge_types, list):
107 assert isinstance(rev_edge_types, list)
--> 108 assert len(edge_types) == len(rev_edge_types)
AssertionError:
Meaning that I don't have as many reverse edge types as original edge types.
Thanks for all the help in advance!
Yes, I think this is related to missing edge types that point to node type B. I think this experience will be smoother in the upcoming release.
Within RandomLinkSplit, you can use None as the rev_edge_type for A<>A and C<>C.
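Concretely (a sketch assuming the edge types from the data dump above and the rev_* names that ToUndirected generates), the two self-relations simply get a None entry:

import torch_geometric.transforms as T

edge_types = [
    ('A', 'A_to_A_edge', 'A'),
    ('B', 'B_to_A_edge', 'A'),
    ('C', 'C_to_A_edge', 'A'),
    ('A', 'A_to_C_edge', 'C'),
    ('B', 'B_to_C_edge', 'C'),
    ('C', 'C_to_C_edge', 'C'),
]
# One entry per edge type; None for A<>A and C<>C, which have no
# separate reverse relation:
rev_edge_types = [
    None,
    ('A', 'rev_B_to_A_edge', 'B'),
    ('A', 'rev_C_to_A_edge', 'C'),
    ('C', 'rev_A_to_C_edge', 'A'),
    ('C', 'rev_B_to_C_edge', 'B'),
    None,
]
train_data, val_data, test_data = T.RandomLinkSplit(
    num_val=0.0, num_test=0.0, neg_sampling_ratio=0.0,
    edge_types=edge_types, rev_edge_types=rev_edge_types,
)(data)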
In addition, I don't think you necessarily need to create a new model for predicting every edge type. You can potentially train this end-to-end together.
Thanks so much for the fast response, it worked :) I have a couple of follow-up questions:
- Could you explain why we need these reverse edges, or share a reference that explains this, please? What is the implication of saying that there are no reverse edges for a certain pair of node types?
- The example I was following was doing it by edge type; would you mind sharing an example for link prediction where the training is done end-to-end, please? I am pretty new to PyG and there are few examples on link prediction; any help will be much appreciated!
- If you have no reverse edges, this means that you have node types which will never get updated during message passing. As such, the feature representation of that node type will be None, leading to the above error. This is generally related to to_hetero, which learns a GNN layer per edge type and aggregates the results coming from multiple edge types into the destination node type. A workaround would be to use the old feature state in that case, but this also leads to problems in case the feature dimensions of nodes change across layers. This is briefly mentioned in the R-GCN paper (footnote 1 on page 2).
- Sure, the goal is that EdgeDecoder performs prediction for each edge type:
class EdgeDecoder(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.lin1_dict = ModuleDict()
        self.lin2_dict = ModuleDict()
        # One pair of linear layers per edge type, keyed by a flat string:
        for edge_type in edge_types:
            e = '__'.join(edge_type)
            self.lin1_dict[e] = Linear(2 * hidden_channels, hidden_channels)
            self.lin2_dict[e] = Linear(hidden_channels, 1)

    def forward(self, z_dict, edge_label_index_dict):
        outs = []
        for edge_type, edge_label_index in edge_label_index_dict.items():
            e = '__'.join(edge_type)
            src, _, dst = edge_type
            row, col = edge_label_index
            z = torch.cat([z_dict[src][row], z_dict[dst][col]], dim=-1)
            z = self.lin1_dict[e](z).relu()
            z = self.lin2_dict[e](z)
            outs.append(z.view(-1))
        return torch.cat(outs)
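A sketch of how this decoder could be wired for end-to-end training (assuming the encoder from the Model class above; the label tensors are concatenated in the same edge-type order the decoder iterates over):

z_dict = model.encoder(train_data.x_dict, train_data.edge_index_dict)
edge_label_index_dict = {
    edge_type: train_data[edge_type].edge_label_index
    for edge_type in train_data.edge_types
}
pred = decoder(z_dict, edge_label_index_dict)
# Concatenate the labels in the same order as the predictions, so one
# loss covers all edge types at once:
target = torch.cat([train_data[edge_type].edge_label
                    for edge_type in edge_label_index_dict])
loss = F.binary_cross_entropy_with_logits(pred, target)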