Intro to PGFrames and semantic encoding¶
This tutorial will help you get started with PGFrame, the property graph data structure provided by BlueGraph, and walks through an example of semantic property encoding. The source notebook can be found here.
import random
import numpy as np
import pandas as pd
from nltk.corpus import words
from bluegraph import PandasPGFrame
from bluegraph.preprocess import ScikitLearnPGEncoder
from bluegraph.backends.stellargraph import StellarGraphNodeEmbedder
Example 1: small property graph¶
Initialize a PandasPGFrame
given a node and edge list.
nodes = ["Alice", "Bob", "Eric", "John", "Anna", "Laura", "Matt"]
sources = [
"Alice", "Alice", "Bob", "Bob", "Bob", "Eric", "Anna", "Anna", "Matt"
]
targets = [
"Bob", "Eric", "Eric", "John", "Anna", "Anna", "Laura", "John", "John"
]
edges = list(zip(sources, targets))
frame = PandasPGFrame(nodes=nodes, edges=edges)
Get nodes and edges as lists.
frame.nodes()
['Alice', 'Bob', 'Eric', 'John', 'Anna', 'Laura', 'Matt']
frame.edges()
[('Alice', 'Bob'),
('Alice', 'Eric'),
('Bob', 'Eric'),
('Bob', 'John'),
('Bob', 'Anna'),
('Eric', 'Anna'),
('Anna', 'Laura'),
('Anna', 'John'),
('Matt', 'John')]
Add properties to nodes and edges. Here, all the properties have type numeric. Other available types are category and text.
age = [25, 9, 70, 42, 26, 35, 36]
frame.add_node_properties(
{
"@id": nodes,
"age": age
}, prop_type="numeric")
height = [180, 122, 173, 194, 172, 156, 177]
frame.add_node_properties(
{
"@id": nodes,
"height": height
}, prop_type="numeric")
weight = [75, 43, 68, 82, 70, 59, 81]
frame.add_node_properties(
{
"@id": nodes,
"weight": weight
}, prop_type="numeric")
weights = [1.0, 2.2, 0.3, 4.1, 1.5, 21.0, 1.0, 2.5, 7.5]
edge_weight = pd.DataFrame({
"@source_id": sources,
"@target_id": targets,
"distance": weights
})
frame.add_edge_properties(edge_weight, prop_type="numeric")
Get nodes and edges as dataframes.
frame.nodes(raw_frame=True).sample(5)
age | height | weight | |
---|---|---|---|
@id | |||
Bob | 9 | 122 | 43 |
Eric | 70 | 173 | 68 |
Anna | 26 | 172 | 70 |
Matt | 36 | 177 | 81 |
Alice | 25 | 180 | 75 |
frame.edges(raw_frame=True).sample(5)
distance | ||
---|---|---|
@source_id | @target_id | |
Bob | John | 4.1 |
Anna | John | 2.5 |
Bob | Anna | 1.5 |
Bob | Eric | 0.3 |
Alice | Bob | 1.0 |
Example 2: Random graph with a given density¶
In this example we will generate a small random graph with a specified density value (i.e. the ratio of realized edges to all possible edges between distinct pairs of nodes).
Create a PandasPGFrame¶
N = 70 # number of nodes
density = 0.1 # density value
# Helper functions for graph generation
def generate_targets(nodes, s, density=0.2):
    """Randomly pick the edges leaving node ``s``.

    Every node ``t`` in ``nodes`` with ``s < t`` is a candidate target;
    one independent 0/1 draw with probability ``density`` of 1 decides
    whether the edge is kept.

    Returns a list of ``[s, t]`` pairs, in the order ``nodes`` is iterated.
    """
    probabilities = [1 - density, density]
    return [
        [s, t]
        for t in nodes
        # The draw only happens for candidates that pass ``s < t``.
        if s < t and np.random.choice([0, 1], p=probabilities)
    ]
def random_pgframe(n_nodes, density):
    """Build a random ``PandasPGFrame`` over the nodes ``0 .. n_nodes - 1``.

    Edges are proposed per source node by ``generate_targets``, which is
    called once for every node with the given ``density``.

    Returns the constructed ``PandasPGFrame``.
    """
    nodes = list(range(n_nodes))
    # Flatten the per-source edge lists in a single pass; the previous
    # ``sum(lists, [])`` re-copied the accumulated list on every addition.
    edge_pairs = [
        pair
        for source in nodes
        for pair in generate_targets(nodes, source, density)
    ]
    edges = pd.DataFrame(edge_pairs, columns=["@source_id", "@target_id"])
    # The frame is created from the (source, target) index of the edge table.
    edges_df = edges.set_index(["@source_id", "@target_id"])
    return PandasPGFrame(nodes=nodes, edges=edges_df.index)
graph_frame = random_pgframe(N, density)
Get nodes and edges as dataframes.
graph_frame.nodes(raw_frame=True).sample(5)
@id |
---|
15 |
27 |
36 |
68 |
11 |
graph_frame.edges(raw_frame=True).sample(5)
@source_id | @target_id |
---|---|
16 | 63 |
16 | 58 |
25 | 52 |
23 | 59 |
25 | 43 |
Add node and edge types¶
Here we generate random types for nodes and edges.
types = ["Apple", "Orange", "Carrot"]
node_types = {
n: np.random.choice(types, p=[0.5, 0.4, 0.1])
for n in range(N)
}
graph_frame.add_node_types(node_types)
graph_frame.nodes(raw_frame=True).sample(5)
@type | |
---|---|
@id | |
14 | Apple |
64 | Apple |
18 | Carrot |
50 | Carrot |
20 | Orange |
types = ["isFriend", "isEnemy"]
edge_types = {
e: np.random.choice(types, p=[0.8, 0.2])
for e in graph_frame.edges()
}
graph_frame.add_edge_types(edge_types)
graph_frame.edges(raw_frame=True).sample(5)
@type | ||
---|---|---|
@source_id | @target_id | |
67 | 68 | isFriend |
41 | 66 | isEnemy |
16 | 30 | isFriend |
17 | 37 | isFriend |
21 | 31 | isFriend |
Add node and edge properties¶
We add node properties of different data types (numeric
,
categorical
, text
) randomly.
weight = pd.DataFrame(
[
(n, np.random.normal(loc=35, scale=5))
for n in graph_frame.nodes()
],
columns=["@id", "weight"]
)
graph_frame.add_node_properties(weight, prop_type="numeric")
# Keep the candidate values under their own name so that ``colors`` only
# ever refers to the node property table.
color_choices = ["red", "green", "blue"]
colors = pd.DataFrame(
    [
        (n, np.random.choice(color_choices))
        for n in graph_frame.nodes()
    ],
    columns=["@id", "color"]
)
graph_frame.add_node_properties(colors, prop_type="category")
# Fetch the word list once; calling words.words() inside the comprehension
# would repeat the call for every node.
vocabulary = words.words()
desc = pd.DataFrame(
    [
        # Each node gets a description made of 20 randomly sampled words.
        (n, ' '.join(random.sample(vocabulary, 20)))
        for n in graph_frame.nodes()
    ],
    columns=["@id", "desc"]
)
graph_frame.add_node_properties(desc, prop_type="text")
graph_frame.nodes(raw_frame=True).sample(5)
@type | weight | color | desc | |
---|---|---|---|---|
@id | ||||
13 | Orange | 40.386831 | blue | cutterhead amanuenses Kashubian Alchornea skin... |
8 | Carrot | 29.168627 | blue | probe menorrhoeic hemicephalous comart gander ... |
29 | Apple | 35.391697 | blue | teruncius tetanoid unsovereign carpocarpal unr... |
10 | Apple | 37.038171 | green | balloter preceding scabies lengthways lotase o... |
18 | Carrot | 32.094158 | green | oiled sphericle relationism neostriatum molehi... |
graph_frame._node_prop_types
{'@type': 'category', 'weight': 'numeric', 'color': 'category', 'desc': 'text'}
We add edge properties of different data types (numeric
,
categorical
, text
) randomly.
years = pd.DataFrame(
[
(s, t, np.random.randint(0, 20))
for s, t in graph_frame.edges()
],
columns=["@source_id", "@target_id", "n_years"]
)
graph_frame.add_edge_properties(years, prop_type="numeric")
# Keep the candidate values under their own name so that ``shapes`` only
# ever refers to the edge property table.
shape_choices = ["dashed", "dotted", "solid"]
shapes = pd.DataFrame(
    [
        (s, t, np.random.choice(shape_choices))
        for s, t in graph_frame.edges()
    ],
    columns=["@source_id", "@target_id", "shapes"]
)
graph_frame.add_edge_properties(shapes, prop_type="category")
# Fetch the word list once; calling words.words() inside the comprehension
# would repeat the call for every edge.
vocabulary = words.words()
desc = pd.DataFrame(
    [
        # Each edge gets a description made of 20 randomly sampled words.
        (s, t, ' '.join(random.sample(vocabulary, 20)))
        for s, t in graph_frame.edges()
    ],
    columns=["@source_id", "@target_id", "desc"]
)
graph_frame.add_edge_properties(desc, prop_type="text")
graph_frame.edges(raw_frame=True).sample(5)
@type | n_years | shapes | desc | ||
---|---|---|---|---|---|
@source_id | @target_id | ||||
8 | 13 | isFriend | 14 | dotted | preconize Berycidae shopmaid tanyard topi piac... |
18 | 34 | isFriend | 4 | dashed | Sterope undermusic lorn sorbefacient Sabbatize... |
21 | 30 | isFriend | 12 | dashed | octadic teleozoic elderberry confirm stigmario... |
1 | 69 | isFriend | 10 | solid | leptocephalia Anglist uncorresponding parafloc... |
25 | 66 | isEnemy | 12 | dashed | Iswara myodynamia barken black timoneer defloc... |
graph_frame._edge_prop_types
{'@type': 'category',
'n_years': 'numeric',
'shapes': 'category',
'desc': 'text'}
Perform semantic encoding of properties¶
BlueGraph can convert node/edge properties of different data types into numerical vectors.
NB: If nltk error occurs, run the following code (the ‘words’ corpus needs to be downloaded for semantic encoding of text properties):
import nltk
nltk.download('words')
Create an encoder object for homogeneous encoding: the properties of all nodes (and, likewise, all edges) are encoded as feature vectors of the same length, independently of their type.
hom_encoder = ScikitLearnPGEncoder(
node_properties=["weight", "color", "desc"],
edge_properties=["n_years", "shapes", "desc"],
edge_features=True,
heterogeneous=False,
encode_types=True,
drop_types=True,
text_encoding="tfidf",
standardize_numeric=True)
transformed_frame = hom_encoder.fit_transform(graph_frame)
transformed_frame.nodes(raw_frame=True).sample(5)
features | |
---|---|
@id | |
25 | [-0.9693465349258025, 0.0, 1.0, 0.0, 0.0, 0.0,... |
59 | [1.0407324935966866, 0.0, 1.0, 0.0, 0.0, 0.0, ... |
40 | [0.22089544697164212, 0.0, 1.0, 0.0, 0.0, 0.0,... |
12 | [1.521323313308059, 0.0, 0.0, 1.0, 0.0, 0.0, 0... |
62 | [-1.2547871487822837, 0.0, 0.0, 1.0, 0.0, 0.0,... |
We can inspect encoding models for different node and edge properties created by BlueGraph.
hom_encoder._node_encoders
{'weight': StandardScaler(),
'color': MultiLabelBinarizer(),
'desc': TfidfVectorizer(max_features=128, stop_words='english', sublinear_tf=True)}
transformed_frame.edges(raw_frame=True).sample(5)
features | ||
---|---|---|
@source_id | @target_id | |
54 | 57 | [-0.2198738883485877, 1.0, 0.0, 0.0, 0.0, 0.0,... |
29 | 40 | [-0.7501579720128285, 0.0, 1.0, 0.0, 0.0, 0.0,... |
2 | 14 | [0.48717155653706673, 0.0, 0.0, 1.0, 0.0, 0.0,... |
15 | 49 | [-1.6339647781198965, 0.0, 0.0, 1.0, 0.0, 0.0,... |
18 | 33 | [0.3104101953156531, 0.0, 0.0, 1.0, 0.0, 0.0, ... |
hom_encoder._edge_encoders
{'n_years': StandardScaler(),
'shapes': MultiLabelBinarizer(),
'desc': TfidfVectorizer(max_features=128, stop_words='english', sublinear_tf=True)}
Convert PGFrames to JSON¶
json_repr = graph_frame.to_json()
json_repr["nodes"][:2]
[{'@id': 0,
'@type': 'Apple',
'weight': 36.53863443435658,
'color': 'green',
'desc': 'Trinitarian undyeable fearedness quinquelobated thermanalgesia unanimous branchful Septentrion deerherd mispleading timbern mechanal papaphobist rowanberry admeasurement disilicide yade undertake innoxiously epiphanous'},
{'@id': 1,
'@type': 'Orange',
'weight': 37.24906812781439,
'color': 'blue',
'desc': 'orderer interpellator acouometer though unpoisonable delegation Yellowknife professorial forenotice computational subinternal weepable cliental microtelephone chandleress feroher falltime consociation theoleptic eustomatous'}]
json_repr["edges"][:2]
[{'@source_id': 0,
'@target_id': 25,
'@type': 'isFriend',
'n_years': 0,
'shapes': 'dotted',
'desc': 'nonsetter noncontent xenelasia ozokerite speiss smithing unillumination stenographer unappeasedly bookling buttgenbachite saxhorn tideless pterygote pix topply spraint wherethrough largen seminebulous'},
{'@source_id': 0,
'@target_id': 33,
'@type': 'isFriend',
'n_years': 15,
'shapes': 'dashed',
'desc': 'traily scagliolist maintenance semipectoral cycloolefin pyovesiculosis reptatorial upsilon rotatodentate determiner marbler benzonitrol sandust cystolithectomy volatilization spiritistic micropterygid unegoistical Rosicrucianism meteorography'}]
Create a new PandasPGFrame
from the generated representation.
new_frame = PandasPGFrame.from_json(json_repr)
new_frame.nodes(raw_frame=True).sample(5)
@type | weight | color | desc | |
---|---|---|---|---|
@id | ||||
40 | Orange | 36.165271 | green | Mareotic dracontian tartrazine cholelithotomy ... |
38 | Apple | 40.665344 | red | ballet ensuer congressionalist unicellular Het... |
28 | Carrot | 35.038295 | green | salicorn outgrowing compensatory vorticism bah... |
13 | Orange | 40.386831 | blue | cutterhead amanuenses Kashubian Alchornea skin... |
55 | Orange | 34.850857 | green | overdrowsed uncommuted recital joyful oxidizab... |