1. 前言
最近在学习图神经网络相关知识。直推式(transductive)的图神经网络训练代价昂贵,且无法直接处理训练时未出现的节点;这篇文章主要介绍一个基于归纳学习(inductive learning)的框架GraphSAGE的代码,它旨在训练一个聚合函数,为训练时未见过的节点(新的节点)生成嵌入。因为自己也是小白,写这篇文章的目的也是为了增强自己对该算法的理解和记忆。由于下载下来的代码没有注释,我会尽可能地加上足够清晰的注释,方便大家阅读,如有错误,望神仙网友给予批评指正!!!
2. 代码下载
该代码是从github上下载而来,使用pytorch框架的一个简易版的GraphSAGE算法,适合小白入手学习。 代码下载链接:https://pan.baidu.com/s/1WW0mkHXupl6kkyyzOG9pBA 提取码:v06v
3. 数据集分析
代码中提供了两种数据集,cora数据集和pubmed数据集,主要针对cora数据集进行分析。 Cora数据集中样本是机器学习论文,论文被分为7类:
- Case_Based
- Genetic_Algorithms
- Neural_Networks
- Probabilistic_Methods
- Reinforcement_Learning
- Rule_Learning
- Theory
数据集共有2708篇论文,分为两个文件:
- cora.cites
- cora.content
第一个文件cora.content文件格式:
<paper_id> <word_attributes>+ <class_label>
<paper_id> :论文的ID(或者说图中节点的ID编号)
<word_attributes>:节点的特征向量(0-1编码)
<class_label>:节点类别
第二个文件cora.cites文件格式:
<ID of cited paper> <ID of citing paper>
<ID of cited paper>:被引用的论文ID
<ID of citing paper>:引用论文的ID
我们可以把每一行看作图中连接这两个节点的一条边
4. 代码分析
主要有三个代码文件:aggregators.py、encoders.py、model.py aggregators.py:用于聚合邻居节点的特征,返回的就是聚合后的邻居节点的特征。 encoders.py:根据aggregators得到的邻居节点特征执行卷积操作 model.py:代码的主文件,加载数据集以及训练代码等操作
4. 1 model.py
首先是一个用于创建GraphSage的SupervisedGraphSage类。
class SupervisedGraphSage(nn.Module):
    """Linear classifier stacked on top of an encoder module.

    The encoder ``enc`` is called with a batch of node ids and must return a
    matrix whose first dimension is ``enc.embed_dim`` (one column per node).
    A learned ``(num_classes, embed_dim)`` weight matrix maps those
    embeddings to class scores.
    """

    def __init__(self, num_classes, enc):
        """
        num_classes -- number of output classes
        enc         -- callable encoder exposing an ``embed_dim`` attribute
        """
        super(SupervisedGraphSage, self).__init__()
        self.enc = enc
        self.xent = nn.CrossEntropyLoss()
        self.weight = nn.Parameter(torch.FloatTensor(num_classes, enc.embed_dim))
        # ``xavier_uniform`` (no trailing underscore) is deprecated; the
        # in-place variant is the supported spelling.
        init.xavier_uniform_(self.weight)

    def forward(self, nodes):
        """Return class scores, one row per node in ``nodes``."""
        embeds = self.enc(nodes)
        scores = self.weight.mm(embeds)
        return scores.t()

    def loss(self, nodes, labels):
        """Return the cross-entropy loss of ``nodes`` against ``labels``.

        ``labels`` is flattened to one dimension. Unlike ``squeeze()``, this
        keeps a batch dimension when the batch holds a single node.
        """
        scores = self.forward(nodes)
        return self.xent(scores, labels.reshape(-1))
上面的代码比较简单,下面看一下加载数据集的代码模块。
def load_cora(
    data_dir=r"D:\workspace\pythonSpace\pythonProject\nlp\graphNetwork\graphsage-simple-master\cora",
    num_nodes=2708,
    num_feats=1433,
):
    """Load the Cora node features, labels and adjacency lists.

    data_dir  -- directory containing ``cora.content`` and ``cora.cites``;
                 the default is the location the original code hard-coded
    num_nodes -- number of rows allocated for features and labels
    num_feats -- number of feature columns per node

    Returns ``(feat_data, labels, adj_lists)``:
      feat_data -- float array of shape ``(num_nodes, num_feats)``
      labels    -- int64 array of shape ``(num_nodes, 1)``; class names are
                   numbered in order of first appearance
      adj_lists -- ``defaultdict(set)`` mapping a node index to the set of
                   node indices it shares an edge with (edges are stored in
                   both directions)
    """
    import os

    feat_data = np.zeros((num_nodes, num_feats))
    labels = np.empty((num_nodes, 1), dtype=np.int64)
    node_map = {}   # paper id (string) -> row index
    label_map = {}  # class name -> integer label

    with open(os.path.join(data_dir, "cora.content"), encoding="utf-8") as fp:
        for i, line in enumerate(fp):
            # Each row: <paper_id> <feature values...> <class_label>
            info = line.strip().split()
            feat_data[i, :] = list(map(float, info[1:-1]))
            node_map[info[0]] = i
            if info[-1] not in label_map:
                label_map[info[-1]] = len(label_map)
            labels[i] = label_map[info[-1]]

    adj_lists = defaultdict(set)
    with open(os.path.join(data_dir, "cora.cites"), encoding="utf-8") as fp:
        for line in fp:
            # Each row holds two paper ids joined by a citation.
            info = line.strip().split()
            paper1 = node_map[info[0]]
            paper2 = node_map[info[1]]
            # The graph is treated as undirected.
            adj_lists[paper1].add(paper2)
            adj_lists[paper2].add(paper1)
    return feat_data, labels, adj_lists
下面看一下训练函数
def run_cora():
    """Train a two-layer supervised GraphSAGE model on Cora and report F1.

    Seeds both NumPy and ``random``, loads the data with ``load_cora``,
    builds two stacked ``Encoder`` layers with mean aggregation, runs 100
    SGD steps on batches of 256 training nodes, then prints the micro-F1 on
    the validation split and the mean time per batch.
    """
    np.random.seed(1)
    random.seed(1)
    feat_data, labels, adj_lists = load_cora()
    # Take the dimensions from the loaded array instead of repeating them.
    num_nodes, num_feats = feat_data.shape

    # Fixed (non-trainable) lookup table holding the raw node features.
    features = nn.Embedding(num_nodes, num_feats)
    features.weight = nn.Parameter(torch.FloatTensor(feat_data), requires_grad=False)

    # Encoder.__init__ overwrites its aggregator's ``cuda`` flag, so the
    # aggregator is created with the same value as the encoder.
    agg1 = MeanAggregator(features, cuda=False)
    enc1 = Encoder(features, num_feats, 128, adj_lists, agg1, gcn=True, cuda=False)
    # The second layer consumes the (transposed) output of the first layer.
    agg2 = MeanAggregator(lambda nodes: enc1(nodes).t(), cuda=False)
    enc2 = Encoder(lambda nodes: enc1(nodes).t(), enc1.embed_dim, 128, adj_lists, agg2,
                   base_model=enc1, gcn=True, cuda=False)
    # Encoder reads ``num_sample``; the original set ``num_samples``, which
    # was never read, so the intended sample size of 5 was ignored.
    enc1.num_sample = 5
    enc2.num_sample = 5

    graphsage = SupervisedGraphSage(7, enc2)

    # Split: the first 1000 shuffled indices are held out (unused here), the
    # next 500 are validation, the rest are training nodes.
    rand_indices = np.random.permutation(num_nodes)
    val = rand_indices[1000:1500]
    train = list(rand_indices[1500:])

    optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, graphsage.parameters()), lr=0.7)
    times = []
    for batch in range(100):
        # The batch is taken before reshuffling, as in the original loop.
        batch_nodes = train[:256]
        random.shuffle(train)
        start_time = time.time()
        optimizer.zero_grad()
        loss = graphsage.loss(batch_nodes,
                              Variable(torch.LongTensor(labels[np.array(batch_nodes)])))
        loss.backward()
        optimizer.step()
        end_time = time.time()
        times.append(end_time - start_time)
        print( batch, loss.data)

    val_output = graphsage.forward(val)
    print( "Validation F1:", f1_score(labels[val], val_output.data.numpy().argmax(axis=1), average="micro"))
    print ("Average batch time:", np.mean(times))
4. 2 aggregators.py
下面我们看一下GraphSAGE如何获得邻居节点的特征。
class MeanAggregator(nn.Module):
    """
    Aggregates a node's embeddings using mean of neighbors' embeddings
    """

    def __init__(self, features, cuda=False, gcn=False):
        """
        Initializes the aggregator for a specific graph.

        features -- function mapping LongTensor of node ids to FloatTensor of feature values.
        cuda -- whether to use GPU
        gcn --- whether to perform concatenation GraphSAGE-style, or add self-loops GCN-style
        """
        super(MeanAggregator, self).__init__()
        self.features = features
        # NOTE(review): this boolean shadows nn.Module.cuda() on the
        # instance. The name is kept because it is assigned from outside
        # (Encoder sets ``aggregator.cuda``).
        self.cuda = cuda
        self.gcn = gcn

    def forward(self, nodes, to_neighs, num_sample=10):
        """
        Return one row per node holding the mean feature of its neighbours.

        nodes --- node ids in the batch
        to_neighs --- for each node, the collection of its neighbour ids
        num_sample --- how many neighbours to sample per node; ``None``
                       keeps all neighbours

        A node whose neighbour set is empty gets an all-zero row.
        """
        if num_sample is not None:
            # random.sample no longer accepts a set (TypeError on
            # Python 3.11+), so the neighbours are turned into a list first.
            samp_neighs = [
                set(random.sample(list(to_neigh), num_sample))
                if len(to_neigh) >= num_sample else to_neigh
                for to_neigh in to_neighs
            ]
        else:
            samp_neighs = to_neighs

        if self.gcn:
            # Add a self-loop. Sets do not support ``+``; build a new set
            # with union so the caller's adjacency sets are left untouched.
            samp_neighs = [set(samp_neigh) | {int(nodes[i])}
                           for i, samp_neigh in enumerate(samp_neighs)]

        # ``set().union(...)`` also works when the batch is empty.
        unique_nodes_list = list(set().union(*samp_neighs))
        unique_nodes = {n: i for i, n in enumerate(unique_nodes_list)}

        # mask[i, j] == 1 iff unique node j is a sampled neighbour of node i.
        mask = Variable(torch.zeros(len(samp_neighs), len(unique_nodes)))
        column_indices = [unique_nodes[n] for samp_neigh in samp_neighs for n in samp_neigh]
        row_indices = [i for i, samp_neigh in enumerate(samp_neighs) for _ in samp_neigh]
        if row_indices:
            mask[row_indices, column_indices] = 1

        node_index = torch.LongTensor(unique_nodes_list)
        if self.cuda:
            mask = mask.cuda()
            # Keep the lookup indices on the same device as the mask.
            node_index = node_index.cuda()

        # Row-normalise to a mean; clamp so that a node with no neighbours
        # yields zeros instead of NaN from a division by zero.
        num_neigh = mask.sum(1, keepdim=True).clamp(min=1)
        mask = mask.div(num_neigh)
        embed_matrix = self.features(node_index)
        to_feats = mask.mm(embed_matrix)
        return to_feats
4. 3 encoders.py
得到聚合了邻居节点的特征向量之后,执行卷积的操作如下:
class Encoder(nn.Module):
    """
    Encodes a node's using 'convolutional' GraphSage approach
    """

    def __init__(self, features, feature_dim,
                 embed_dim, adj_lists, aggregator,
                 num_sample=10,
                 base_model=None, gcn=False, cuda=False,
                 feature_transform=False):
        """
        features -- callable mapping a LongTensor of node ids to their features
        feature_dim -- width of the features returned by ``features``
        embed_dim -- width of the embeddings this layer produces
        adj_lists -- mapping from node id to the set of its neighbour ids
        aggregator -- object whose ``forward(nodes, neighs, num_sample)``
                      returns the aggregated neighbour features
        num_sample -- number of neighbours passed on to the aggregator
        base_model -- optional lower layer; stored only when given
        gcn -- if True use only the aggregated features, otherwise
               concatenate each node's own features with them
        cuda -- whether node ids are moved to the GPU; also written onto
                the aggregator
        feature_transform -- accepted but not used by this class
        """
        super(Encoder, self).__init__()
        self.features = features
        self.feat_dim = feature_dim
        self.adj_lists = adj_lists
        self.aggregator = aggregator
        self.num_sample = num_sample
        if base_model is not None:
            self.base_model = base_model
        self.gcn = gcn
        self.embed_dim = embed_dim
        # NOTE(review): this boolean shadows nn.Module.cuda() on the instance.
        self.cuda = cuda
        self.aggregator.cuda = cuda
        # Without gcn the input is [own features, neighbour features], so
        # the weight matrix is twice as wide.
        self.weight = nn.Parameter(
            torch.FloatTensor(embed_dim, self.feat_dim if self.gcn else 2 * self.feat_dim))
        # ``xavier_uniform`` (no trailing underscore) is deprecated; the
        # in-place variant is the supported spelling.
        init.xavier_uniform_(self.weight)

    def forward(self, nodes):
        """
        Generates embeddings for a batch of nodes.

        nodes -- list of nodes

        Returns a matrix with ``embed_dim`` rows and one column per node.
        """
        neigh_feats = self.aggregator.forward(
            nodes, [self.adj_lists[int(node)] for node in nodes], self.num_sample)
        if not self.gcn:
            node_index = torch.LongTensor(nodes)
            if self.cuda:
                node_index = node_index.cuda()
            self_feats = self.features(node_index)
            combined = torch.cat([self_feats, neigh_feats], dim=1)
        else:
            combined = neigh_feats
        combined = F.relu(self.weight.mm(combined.t()))
        return combined
5 总结
以上就是使用均值聚合器MeanAggregator实现的GraphSAGE算法,我尽可能多地为每一行代码加上了注释,如有错误,望批评指正。 除了上面的均值聚合方式,还有LSTM、池化聚合方式,还有无监督的GraphSAGE训练方式,如果有机会,争取在后面学习之后再写一篇博文分享出来。
|