Intro to PGFrames and semantic encoding

This tutorial will help you get started with PGFrame, the property graph data structure provided by BlueGraph, and walks through an example of semantic property encoding. The source notebook can be found here.

import random

import numpy as np
import pandas as pd

from nltk.corpus import words
from bluegraph import PandasPGFrame
from bluegraph.preprocess import ScikitLearnPGEncoder
from bluegraph.backends.stellargraph import StellarGraphNodeEmbedder

Example 1: small property graph

Initialize a PandasPGFrame from a node list and an edge list.

nodes = ["Alice", "Bob", "Eric", "John", "Anna", "Laura", "Matt"]

sources = [
    "Alice", "Alice", "Bob", "Bob", "Bob", "Eric", "Anna", "Anna", "Matt"
]
targets = [
    "Bob", "Eric", "Eric", "John", "Anna", "Anna", "Laura", "John", "John"
]
edges = list(zip(sources, targets))

frame = PandasPGFrame(nodes=nodes, edges=edges)

Get nodes and edges as lists.

frame.nodes()
['Alice', 'Bob', 'Eric', 'John', 'Anna', 'Laura', 'Matt']
frame.edges()
[('Alice', 'Bob'),
 ('Alice', 'Eric'),
 ('Bob', 'Eric'),
 ('Bob', 'John'),
 ('Bob', 'Anna'),
 ('Eric', 'Anna'),
 ('Anna', 'Laura'),
 ('Anna', 'John'),
 ('Matt', 'John')]

Add properties to nodes and edges. Here, all the properties have the type numeric (prop_type="numeric"). The other available property types are category and text.

# Numeric attributes of each person, listed in the same order as `nodes`.
age = [25, 9, 70, 42, 26, 35, 36]
height = [180, 122, 173, 194, 172, 156, 177]
weight = [75, 43, 68, 82, 70, 59, 81]

# Register each attribute as its own numeric node property, keyed by "@id".
for prop_name, prop_values in (
        ("age", age), ("height", height), ("weight", weight)):
    frame.add_node_properties(
        {"@id": nodes, prop_name: prop_values}, prop_type="numeric")


weights = [1.0, 2.2, 0.3, 4.1, 1.5, 21.0, 1.0, 2.5, 7.5]
edge_weight = pd.DataFrame({
    "@source_id": sources,
    "@target_id": targets,
    "distance": weights
})
frame.add_edge_properties(edge_weight, prop_type="numeric")

Get nodes and edges as dataframes.

frame.nodes(raw_frame=True).sample(5)
age height weight
@id
Bob 9 122 43
Eric 70 173 68
Anna 26 172 70
Matt 36 177 81
Alice 25 180 75
frame.edges(raw_frame=True).sample(5)
distance
@source_id @target_id
Bob John 4.1
Anna John 2.5
Bob Anna 1.5
Eric 0.3
Alice Bob 1.0

Example 2: Random graph with a given density

In this example we will generate a small random graph with a specified density value (i.e. the ratio of realized edges to all possible edges between distinct pairs of nodes).

Create a PandasPGFrame

N = 70  # number of nodes
density = 0.1  # density value
# Helper functions for graph generation

def generate_targets(nodes, s, density=0.2):
    """Randomly select edges from ``s`` to the nodes ordered after it.

    Every node ``t`` in ``nodes`` satisfying ``s < t`` is considered once,
    and the pair ``[s, t]`` is kept with probability ``density``.

    Parameters
    ----------
    nodes : iterable
        Candidate target nodes; they must be comparable with ``s``.
    s : object
        Source node.
    density : float, optional
        Probability of keeping each candidate edge (default 0.2).

    Returns
    -------
    list
        The retained edges as ``[source, target]`` lists, in the order
        the targets appear in ``nodes``.
    """
    outcome_probs = [1 - density, density]
    # `and` short-circuits, so a random draw is made only for candidates
    # that satisfy ``s < target``.
    return [
        [s, target]
        for target in nodes
        if s < target and np.random.choice([0, 1], p=outcome_probs)
    ]


def random_pgframe(n_nodes, density):
    """Build a random ``PandasPGFrame`` over the nodes ``0 .. n_nodes - 1``.

    For every source node, candidate edges to higher-numbered nodes are
    drawn with ``generate_targets`` using the given ``density``.

    Parameters
    ----------
    n_nodes : int
        Number of nodes; node ids are the integers ``range(n_nodes)``.
    density : float
        Probability of realizing each candidate edge.

    Returns
    -------
    PandasPGFrame
        Frame constructed from the node list and the
        (``@source_id``, ``@target_id``) index of the generated edges.
    """
    nodes = list(range(n_nodes))

    # Flatten the per-source edge lists in a single pass; this avoids the
    # quadratic list copying of ``sum(list_of_lists, [])``.
    edge_list = [
        edge
        for source in nodes
        for edge in generate_targets(nodes, source, density)
    ]
    edge_columns = ["@source_id", "@target_id"]
    edge_index = pd.DataFrame(
        edge_list, columns=edge_columns).set_index(edge_columns).index
    return PandasPGFrame(nodes=nodes, edges=edge_index)
graph_frame = random_pgframe(N, density)

Get nodes and edges as dataframes.

graph_frame.nodes(raw_frame=True).sample(5)
@id
15
27
36
68
11
graph_frame.edges(raw_frame=True).sample(5)
@source_id @target_id
16 63
58
25 52
23 59
25 43

Add node and edge types

Here we generate random types for nodes and edges.

types = ["Apple", "Orange", "Carrot"]
node_types = {
    n: np.random.choice(types, p=[0.5, 0.4, 0.1])
    for n in range(N)
}
graph_frame.add_node_types(node_types)
graph_frame.nodes(raw_frame=True).sample(5)
@type
@id
14 Apple
64 Apple
18 Carrot
50 Carrot
20 Orange
types = ["isFriend", "isEnemy"]
edge_types = {
    e: np.random.choice(types, p=[0.8, 0.2])
    for e in graph_frame.edges()
}
graph_frame.add_edge_types(edge_types)
graph_frame.edges(raw_frame=True).sample(5)
@type
@source_id @target_id
67 68 isFriend
41 66 isEnemy
16 30 isFriend
17 37 isFriend
21 31 isFriend

Add node and edge properties

We add node properties of different data types (numeric, categorical, text) randomly.

weight = pd.DataFrame(
    [
        (n, np.random.normal(loc=35, scale=5))
        for n in graph_frame.nodes()
    ],
    columns=["@id", "weight"]
)
graph_frame.add_node_properties(weight, prop_type="numeric")
colors = ["red", "green", "blue"]
colors = pd.DataFrame(
    [
        (n, np.random.choice(colors))
        for n in graph_frame.nodes()
    ],
    columns=["@id", "color"]
)
graph_frame.add_node_properties(colors, prop_type="category")
# Fetch the word list once instead of calling `words.words()` for every
# node: the call is loop-invariant and builds the full list each time.
vocabulary = words.words()
# Each node gets a random "description" made of 20 distinct corpus words.
desc = pd.DataFrame(
    [
        (n, ' '.join(random.sample(vocabulary, 20)))
        for n in graph_frame.nodes()
    ],
    columns=["@id", "desc"]
)
graph_frame.add_node_properties(desc, prop_type="text")
graph_frame.nodes(raw_frame=True).sample(5)
@type weight color desc
@id
13 Orange 40.386831 blue cutterhead amanuenses Kashubian Alchornea skin...
8 Carrot 29.168627 blue probe menorrhoeic hemicephalous comart gander ...
29 Apple 35.391697 blue teruncius tetanoid unsovereign carpocarpal unr...
10 Apple 37.038171 green balloter preceding scabies lengthways lotase o...
18 Carrot 32.094158 green oiled sphericle relationism neostriatum molehi...
graph_frame._node_prop_types
{'@type': 'category', 'weight': 'numeric', 'color': 'category', 'desc': 'text'}

We add edge properties of different data types (numeric, categorical, text) randomly.

years = pd.DataFrame(
    [
        (s, t, np.random.randint(0, 20))
        for s, t in graph_frame.edges()
    ],
    columns=["@source_id", "@target_id", "n_years"]
)
graph_frame.add_edge_properties(years, prop_type="numeric")
shapes = ["dashed", "dotted", "solid"]
shapes = pd.DataFrame(
    [
        (s, t, np.random.choice(shapes))
        for s, t, in graph_frame.edges()
    ],
    columns=["@source_id", "@target_id", "shapes"]
)
graph_frame.add_edge_properties(shapes, prop_type="category")
# Fetch the word list once instead of calling `words.words()` for every
# edge: the call is loop-invariant and builds the full list each time.
vocabulary = words.words()
# Each edge gets a random "description" made of 20 distinct corpus words.
desc = pd.DataFrame(
    [
        (s, t, ' '.join(random.sample(vocabulary, 20)))
        for s, t in graph_frame.edges()
    ],
    columns=["@source_id", "@target_id", "desc"]
)
graph_frame.add_edge_properties(desc, prop_type="text")
graph_frame.edges(raw_frame=True).sample(5)
@type n_years shapes desc
@source_id @target_id
8 13 isFriend 14 dotted preconize Berycidae shopmaid tanyard topi piac...
18 34 isFriend 4 dashed Sterope undermusic lorn sorbefacient Sabbatize...
21 30 isFriend 12 dashed octadic teleozoic elderberry confirm stigmario...
1 69 isFriend 10 solid leptocephalia Anglist uncorresponding parafloc...
25 66 isEnemy 12 dashed Iswara myodynamia barken black timoneer defloc...
graph_frame._edge_prop_types
{'@type': 'category',
 'n_years': 'numeric',
 'shapes': 'category',
 'desc': 'text'}

Perform semantic encoding of properties

BlueGraph can convert node/edge properties of different data types into numerical vectors.

NB: If an nltk error occurs, run the following code (the ‘words’ corpus, which is also used above to generate the random text properties, needs to be downloaded for semantic encoding of text properties):

import nltk
nltk.download('words')

Create an encoder object for homogeneous encoding (the properties of all nodes, and likewise of all edges, are encoded with feature vectors of the same length, independently of their type).

hom_encoder = ScikitLearnPGEncoder(
    node_properties=["weight", "color", "desc"],
    edge_properties=["n_years", "shapes", "desc"],
    edge_features=True,
    heterogeneous=False,
    encode_types=True,
    drop_types=True,
    text_encoding="tfidf",
    standardize_numeric=True)
transformed_frame = hom_encoder.fit_transform(graph_frame)
transformed_frame.nodes(raw_frame=True).sample(5)
features
@id
25 [-0.9693465349258025, 0.0, 1.0, 0.0, 0.0, 0.0,...
59 [1.0407324935966866, 0.0, 1.0, 0.0, 0.0, 0.0, ...
40 [0.22089544697164212, 0.0, 1.0, 0.0, 0.0, 0.0,...
12 [1.521323313308059, 0.0, 0.0, 1.0, 0.0, 0.0, 0...
62 [-1.2547871487822837, 0.0, 0.0, 1.0, 0.0, 0.0,...

We can inspect encoding models for different node and edge properties created by BlueGraph.

hom_encoder._node_encoders
{'weight': StandardScaler(),
 'color': MultiLabelBinarizer(),
 'desc': TfidfVectorizer(max_features=128, stop_words='english', sublinear_tf=True)}
transformed_frame.edges(raw_frame=True).sample(5)
features
@source_id @target_id
54 57 [-0.2198738883485877, 1.0, 0.0, 0.0, 0.0, 0.0,...
29 40 [-0.7501579720128285, 0.0, 1.0, 0.0, 0.0, 0.0,...
2 14 [0.48717155653706673, 0.0, 0.0, 1.0, 0.0, 0.0,...
15 49 [-1.6339647781198965, 0.0, 0.0, 1.0, 0.0, 0.0,...
18 33 [0.3104101953156531, 0.0, 0.0, 1.0, 0.0, 0.0, ...
hom_encoder._edge_encoders
{'n_years': StandardScaler(),
 'shapes': MultiLabelBinarizer(),
 'desc': TfidfVectorizer(max_features=128, stop_words='english', sublinear_tf=True)}

Convert PGFrames to JSON

json_repr = graph_frame.to_json()
json_repr["nodes"][:2]
[{'@id': 0,
  '@type': 'Apple',
  'weight': 36.53863443435658,
  'color': 'green',
  'desc': 'Trinitarian undyeable fearedness quinquelobated thermanalgesia unanimous branchful Septentrion deerherd mispleading timbern mechanal papaphobist rowanberry admeasurement disilicide yade undertake innoxiously epiphanous'},
 {'@id': 1,
  '@type': 'Orange',
  'weight': 37.24906812781439,
  'color': 'blue',
  'desc': 'orderer interpellator acouometer though unpoisonable delegation Yellowknife professorial forenotice computational subinternal weepable cliental microtelephone chandleress feroher falltime consociation theoleptic eustomatous'}]
json_repr["edges"][:2]
[{'@source_id': 0,
  '@target_id': 25,
  '@type': 'isFriend',
  'n_years': 0,
  'shapes': 'dotted',
  'desc': 'nonsetter noncontent xenelasia ozokerite speiss smithing unillumination stenographer unappeasedly bookling buttgenbachite saxhorn tideless pterygote pix topply spraint wherethrough largen seminebulous'},
 {'@source_id': 0,
  '@target_id': 33,
  '@type': 'isFriend',
  'n_years': 15,
  'shapes': 'dashed',
  'desc': 'traily scagliolist maintenance semipectoral cycloolefin pyovesiculosis reptatorial upsilon rotatodentate determiner marbler benzonitrol sandust cystolithectomy volatilization spiritistic micropterygid unegoistical Rosicrucianism meteorography'}]

Create a new PandasPGFrame from the generated representation.

new_frame = PandasPGFrame.from_json(json_repr)
new_frame.nodes(raw_frame=True).sample(5)
@type weight color desc
@id
40 Orange 36.165271 green Mareotic dracontian tartrazine cholelithotomy ...
38 Apple 40.665344 red ballet ensuer congressionalist unicellular Het...
28 Carrot 35.038295 green salicorn outgrowing compensatory vorticism bah...
13 Orange 40.386831 blue cutterhead amanuenses Kashubian Alchornea skin...
55 Orange 34.850857 green overdrowsed uncommuted recital joyful oxidizab...