Intro to PGFrames and semantic encoding¶

This tutorial will help you get started with PGFrame, the property graph data structure provided by BlueGraph, and walks through an example of semantic property encoding. The source notebook can be found here.

import random

import numpy as np
import pandas as pd

from nltk.corpus import words

from bluegraph import PandasPGFrame
from bluegraph.preprocess import ScikitLearnPGEncoder
from bluegraph.backends.stellargraph import StellarGraphNodeEmbedder

Example 1: small property graph¶

Initialize a PandasPGFrame given a node and edge list.

# Node identifiers of the example graph.
nodes = ["Alice", "Bob", "Eric", "John", "Anna", "Laura", "Matt"]

# Edge endpoints given as two parallel lists: the i-th edge connects
# sources[i] to targets[i].
sources = [
    "Alice", "Alice", "Bob", "Bob", "Bob", "Eric", "Anna", "Anna", "Matt"
]
targets = [
    "Bob", "Eric", "Eric", "John", "Anna", "Anna", "Laura", "John", "John"
]
# Pair the endpoints up into (source, target) tuples.
edges = list(zip(sources, targets))

# Build the property graph frame from the node and edge lists.
frame = PandasPGFrame(nodes=nodes, edges=edges)

Get nodes and edges as lists.

frame.nodes()

['Alice', 'Bob', 'Eric', 'John', 'Anna', 'Laura', 'Matt']

frame.edges()

[('Alice', 'Bob'),
 ('Alice', 'Eric'),
 ('Bob', 'Eric'),
 ('Bob', 'John'),
 ('Bob', 'Anna'),
 ('Eric', 'Anna'),
 ('Anna', 'Laura'),
 ('Anna', 'John'),
 ('Matt', 'John')]

Add properties to nodes and edges. Here, all the properties have the type numeric. Other available types are category and text.

# Numeric node properties; every list is aligned with `nodes`.
age = [25, 9, 70, 42, 26, 35, 36]
height = [180, 122, 173, 194, 172, 156, 177]
weight = [75, 43, 68, 82, 70, 59, 81]

# Register the properties one at a time, in the order age, height, weight.
for prop_name, prop_values in (
        ("age", age), ("height", height), ("weight", weight)):
    frame.add_node_properties(
        {"@id": nodes, prop_name: prop_values}, prop_type="numeric")


# One value per edge, listed in the same order as `sources` / `targets`.
weights = [1.0, 2.2, 0.3, 4.1, 1.5, 21.0, 1.0, 2.5, 7.5]
# Edge property table: the two endpoint columns identify the edge and the
# remaining column ("distance") holds the property values.
edge_weight = pd.DataFrame({
    "@source_id": sources,
    "@target_id": targets,
    "distance": weights
})
frame.add_edge_properties(edge_weight, prop_type="numeric")

Get nodes and edges as dataframes.

frame.nodes(raw_frame=True).sample(5)

	age	height	weight
@id
Bob	9	122	43
Eric	70	173	68
Anna	26	172	70
Matt	36	177	81
Alice	25	180	75

frame.edges(raw_frame=True).sample(5)

		distance
@source_id	@target_id
Bob	John	4.1
Anna	John	2.5
Bob	Anna	1.5
Bob	Eric	0.3
Alice	Bob	1.0

Example 2: Random graph with a given density¶

In this example we will generate a small random graph with a specified density value (i.e. the ratio of realized edges to all possible edges between distinct pairs of nodes).

Create a PandasPGFrame¶

N = 70  # number of nodes
density = 0.1  # density value

# Helper functions for graph generation

def generate_targets(nodes, s, density=0.2):
    """Randomly pick the targets of edges starting at node `s`.

    Every node `t` in `nodes` with `s < t` is considered once, and the
    edge `[s, t]` is kept with probability `density`.

    Parameters
    ----------
    nodes : iterable
        Candidate target nodes (must be comparable with `s`).
    s : object
        Source node.
    density : float, optional
        Probability of keeping each candidate edge (default 0.2).

    Returns
    -------
    list of [s, t] lists
        The realized edges, in the iteration order of `nodes`.
    """
    outcome_probabilities = [1 - density, density]
    # The `s < t` test comes first, so a random draw is made only for
    # candidate pairs.
    return [
        [s, t]
        for t in nodes
        if s < t and np.random.choice([0, 1], p=outcome_probabilities)
    ]


def random_pgframe(n_nodes, density):
    """Create a PandasPGFrame holding a random graph.

    Nodes are the integers `0 .. n_nodes - 1`. For every node the
    outgoing edges are drawn with `generate_targets`, so each pair
    `(s, t)` with `s < t` is realized with probability `density`.

    Parameters
    ----------
    n_nodes : int
        Number of nodes in the generated graph.
    density : float
        Probability of realizing each candidate edge.

    Returns
    -------
    PandasPGFrame
        Frame containing the generated nodes and edges.
    """
    nodes = list(range(n_nodes))

    # Flatten the per-source edge lists in a single pass; `sum(lists, [])`
    # builds a new list for every source and is quadratic in the number
    # of edges.
    edge_list = [
        edge
        for source in nodes
        for edge in generate_targets(nodes, source, density)
    ]
    edges = pd.DataFrame(
        edge_list, columns=["@source_id", "@target_id"])
    # The frame is constructed from the (source, target) multi-index.
    edges_df = edges.set_index(["@source_id", "@target_id"])
    return PandasPGFrame(nodes=nodes, edges=edges_df.index)

graph_frame = random_pgframe(N, density)

Get nodes and edges as dataframes.

graph_frame.nodes(raw_frame=True).sample(5)


@id
15
27
36
68
11

graph_frame.edges(raw_frame=True).sample(5)


@source_id	@target_id
16	63
16	58
25	52
23	59
25	43

Add node and edge types¶

Here we generate random types for nodes and edges.

# Candidate node types; one type is drawn per node id in 0..N-1 with
# probabilities Apple 0.5, Orange 0.4, Carrot 0.1.
types = ["Apple", "Orange", "Carrot"]
node_types = {
    n: np.random.choice(types, p=[0.5, 0.4, 0.1])
    for n in range(N)
}

# Attach the sampled types to the nodes of the graph frame.
graph_frame.add_node_types(node_types)

graph_frame.nodes(raw_frame=True).sample(5)

	@type
@id
14	Apple
64	Apple
18	Carrot
50	Carrot
20	Orange

# Candidate edge types; one type is drawn per existing edge with
# probabilities isFriend 0.8, isEnemy 0.2. Note that this rebinds `types`.
types = ["isFriend", "isEnemy"]
edge_types = {
    e: np.random.choice(types, p=[0.8, 0.2])
    for e in graph_frame.edges()
}

# Attach the sampled types to the edges of the graph frame.
graph_frame.add_edge_types(edge_types)

graph_frame.edges(raw_frame=True).sample(5)

		@type
@source_id	@target_id
67	68	isFriend
41	66	isEnemy
16	30	isFriend
17	37	isFriend
21	31	isFriend

Add node and edge properties¶

We add node properties of different data types (numeric, categorical, text) randomly.

# One normally distributed value (mean 35, standard deviation 5) per node,
# stored as (node id, weight) rows.
weight = pd.DataFrame(
    [
        (n, np.random.normal(loc=35, scale=5))
        for n in graph_frame.nodes()
    ],
    columns=["@id", "weight"]
)

graph_frame.add_node_properties(weight, prop_type="numeric")

# The candidate values get their own name so that `colors` only ever refers
# to the property table (previously the same name was first the list of
# choices and then the DataFrame built from it).
color_choices = ["red", "green", "blue"]

# One uniformly drawn color per node, stored as (node id, color) rows.
colors = pd.DataFrame(
    [
        (n, np.random.choice(color_choices))
        for n in graph_frame.nodes()
    ],
    columns=["@id", "color"]
)

graph_frame.add_node_properties(colors, prop_type="category")

# Fetch the vocabulary once instead of calling `words.words()` again for
# every node.
vocabulary = words.words()

# Each description is 20 words sampled without replacement from the
# vocabulary and joined by spaces.
desc = pd.DataFrame(
    [
        (n, ' '.join(random.sample(vocabulary, 20)))
        for n in graph_frame.nodes()
    ],
    columns=["@id", "desc"]
)

graph_frame.add_node_properties(desc, prop_type="text")

graph_frame.nodes(raw_frame=True).sample(5)

	@type	weight	color	desc
@id
13	Orange	40.386831	blue	cutterhead amanuenses Kashubian Alchornea skin...
8	Carrot	29.168627	blue	probe menorrhoeic hemicephalous comart gander ...
29	Apple	35.391697	blue	teruncius tetanoid unsovereign carpocarpal unr...
10	Apple	37.038171	green	balloter preceding scabies lengthways lotase o...
18	Carrot	32.094158	green	oiled sphericle relationism neostriatum molehi...

graph_frame._node_prop_types

{'@type': 'category', 'weight': 'numeric', 'color': 'category', 'desc': 'text'}

We add edge properties of different data types (numeric, categorical, text) randomly.

# One random integer per edge, drawn with np.random.randint(0, 20)
# (the upper bound is exclusive), stored as (source, target, n_years) rows.
years = pd.DataFrame(
    [
        (s, t, np.random.randint(0, 20))
        for s, t in graph_frame.edges()
    ],
    columns=["@source_id", "@target_id", "n_years"]
)

graph_frame.add_edge_properties(years, prop_type="numeric")

# The candidate values get their own name so that `shapes` only ever refers
# to the property table (previously the same name was first the list of
# choices and then the DataFrame built from it).
shape_choices = ["dashed", "dotted", "solid"]

# One uniformly drawn shape per edge, stored as (source, target, shape) rows.
shapes = pd.DataFrame(
    [
        (s, t, np.random.choice(shape_choices))
        for s, t in graph_frame.edges()
    ],
    columns=["@source_id", "@target_id", "shapes"]
)

graph_frame.add_edge_properties(shapes, prop_type="category")

# Fetch the vocabulary once instead of calling `words.words()` again for
# every edge.
vocabulary = words.words()

# Each description is 20 words sampled without replacement from the
# vocabulary and joined by spaces.
desc = pd.DataFrame(
    [
        (s, t, ' '.join(random.sample(vocabulary, 20)))
        for s, t in graph_frame.edges()
    ],
    columns=["@source_id", "@target_id", "desc"]
)

graph_frame.add_edge_properties(desc, prop_type="text")

graph_frame.edges(raw_frame=True).sample(5)

		@type	n_years	shapes	desc
@source_id	@target_id
8	13	isFriend	14	dotted	preconize Berycidae shopmaid tanyard topi piac...
18	34	isFriend	4	dashed	Sterope undermusic lorn sorbefacient Sabbatize...
21	30	isFriend	12	dashed	octadic teleozoic elderberry confirm stigmario...
1	69	isFriend	10	solid	leptocephalia Anglist uncorresponding parafloc...
25	66	isEnemy	12	dashed	Iswara myodynamia barken black timoneer defloc...

graph_frame._edge_prop_types

{'@type': 'category',
 'n_years': 'numeric',
 'shapes': 'category',
 'desc': 'text'}

Perform semantic encoding of properties¶

BlueGraph allows converting node/edge properties of different data types into numerical vectors.

NB: If an nltk error occurs, run the following code (the ‘words’ corpus needs to be downloaded for semantic encoding of text properties):

import nltk
nltk.download('words')

Create an encoder object for homogeneous encoding (properties of all the nodes (edges) are encoded with feature vectors of the same length independently of their type).

# Encoder configured for the node and edge properties added earlier.
# `heterogeneous=False` requests the homogeneous encoding, and
# `text_encoding="tfidf"` selects TF-IDF for text properties.
# NOTE(review): the exact semantics of `encode_types`, `drop_types`,
# `edge_features` and `standardize_numeric` are defined by BlueGraph --
# confirm against its documentation.
hom_encoder = ScikitLearnPGEncoder(
    node_properties=["weight", "color", "desc"],
    edge_properties=["n_years", "shapes", "desc"],
    edge_features=True,
    heterogeneous=False,
    encode_types=True,
    drop_types=True,
    text_encoding="tfidf",
    standardize_numeric=True)

# Fit the encoders on `graph_frame` and produce the encoded frame.
transformed_frame = hom_encoder.fit_transform(graph_frame)

transformed_frame.nodes(raw_frame=True).sample(5)

	features
@id
25	[-0.9693465349258025, 0.0, 1.0, 0.0, 0.0, 0.0,...
59	[1.0407324935966866, 0.0, 1.0, 0.0, 0.0, 0.0, ...
40	[0.22089544697164212, 0.0, 1.0, 0.0, 0.0, 0.0,...
12	[1.521323313308059, 0.0, 0.0, 1.0, 0.0, 0.0, 0...
62	[-1.2547871487822837, 0.0, 0.0, 1.0, 0.0, 0.0,...

We can inspect encoding models for different node and edge properties created by BlueGraph.

hom_encoder._node_encoders

{'weight': StandardScaler(),
 'color': MultiLabelBinarizer(),
 'desc': TfidfVectorizer(max_features=128, stop_words='english', sublinear_tf=True)}

transformed_frame.edges(raw_frame=True).sample(5)

		features
@source_id	@target_id
54	57	[-0.2198738883485877, 1.0, 0.0, 0.0, 0.0, 0.0,...
29	40	[-0.7501579720128285, 0.0, 1.0, 0.0, 0.0, 0.0,...
2	14	[0.48717155653706673, 0.0, 0.0, 1.0, 0.0, 0.0,...
15	49	[-1.6339647781198965, 0.0, 0.0, 1.0, 0.0, 0.0,...
18	33	[0.3104101953156531, 0.0, 0.0, 1.0, 0.0, 0.0, ...

hom_encoder._edge_encoders

{'n_years': StandardScaler(),
 'shapes': MultiLabelBinarizer(),
 'desc': TfidfVectorizer(max_features=128, stop_words='english', sublinear_tf=True)}

Convert PGFrames to JSON¶

json_repr = graph_frame.to_json()

json_repr["nodes"][:2]

[{'@id': 0,
  '@type': 'Apple',
  'weight': 36.53863443435658,
  'color': 'green',
  'desc': 'Trinitarian undyeable fearedness quinquelobated thermanalgesia unanimous branchful Septentrion deerherd mispleading timbern mechanal papaphobist rowanberry admeasurement disilicide yade undertake innoxiously epiphanous'},
 {'@id': 1,
  '@type': 'Orange',
  'weight': 37.24906812781439,
  'color': 'blue',
  'desc': 'orderer interpellator acouometer though unpoisonable delegation Yellowknife professorial forenotice computational subinternal weepable cliental microtelephone chandleress feroher falltime consociation theoleptic eustomatous'}]

json_repr["edges"][:2]

[{'@source_id': 0,
  '@target_id': 25,
  '@type': 'isFriend',
  'n_years': 0,
  'shapes': 'dotted',
  'desc': 'nonsetter noncontent xenelasia ozokerite speiss smithing unillumination stenographer unappeasedly bookling buttgenbachite saxhorn tideless pterygote pix topply spraint wherethrough largen seminebulous'},
 {'@source_id': 0,
  '@target_id': 33,
  '@type': 'isFriend',
  'n_years': 15,
  'shapes': 'dashed',
  'desc': 'traily scagliolist maintenance semipectoral cycloolefin pyovesiculosis reptatorial upsilon rotatodentate determiner marbler benzonitrol sandust cystolithectomy volatilization spiritistic micropterygid unegoistical Rosicrucianism meteorography'}]

Create a new PandasPGFrame from the generated representation.

new_frame = PandasPGFrame.from_json(json_repr)

new_frame.nodes(raw_frame=True).sample(5)

	@type	weight	color	desc
@id
40	Orange	36.165271	green	Mareotic dracontian tartrazine cholelithotomy ...
38	Apple	40.665344	red	ballet ensuer congressionalist unicellular Het...
28	Carrot	35.038295	green	salicorn outgrowing compensatory vorticism bah...
13	Orange	40.386831	blue	cutterhead amanuenses Kashubian Alchornea skin...
55	Orange	34.850857	green	overdrowsed uncommuted recital joyful oxidizab...