Example analysis and anonymization of a sensitive dataset

[1]:
from pyarxaas import ARXaaS
from pyarxaas.privacy_models import KAnonymity, LDiversityDistinct
from pyarxaas import AttributeType
from pyarxaas import Dataset
import pandas as pd

Create ARXaaS connection

[2]:
# Client for the ARXaaS service. The URL points at an instance on
# localhost:8080 (a locally running service, not a remote one); change it
# if your ARXaaS instance is hosted elsewhere.
arxaas = ARXaaS("http://localhost:8080/")

Fetch sensitive data

[3]:
# Load the raw records; the file uses ';' as its field separator.
data_df = pd.read_csv(
    filepath_or_buffer="../data/data2.csv",
    sep=";",
)
[4]:
# Show the loaded frame so the raw columns can be inspected before anonymization.
data_df
[4]:
zipcode age salary disease
0 47677 29 3 gastric ulcer
1 47602 22 4 gastritis
2 47678 27 5 stomach cancer
3 47905 43 6 gastritis
4 47909 52 11 flu
5 47906 47 8 bronchitis
6 47605 30 7 bronchitis
7 47673 36 9 pneumonia
8 47607 32 10 stomach cancer

Create Dataset

[6]:
# Wrap the pandas frame in a pyarxaas Dataset; this is the object handed to
# arxaas.risk_profile() and arxaas.anonymize() later in the notebook.
dataset = Dataset.from_pandas(data_df)

Set the AttributeType for the dataset fields

[7]:
# Mark 'salary' as IDENTIFYING. No other column's type is set in this
# notebook, so they keep the library default (presumably quasi-identifying —
# confirm in the pyarxaas documentation).
dataset.set_attribute_type(AttributeType.IDENTIFYING, 'salary')

Set Generalization Hierarchies

Note: if a hierarchy CSV file has no header row, pass header=None to read_csv(). Otherwise pandas interprets the first row as a header, that row is dropped from the hierarchy data, and ARXaaS throws an exception for the missing hierarchy data.

[8]:
# Read the three generalization hierarchies in one pass. The files have no
# header row, so header=None keeps their first line as data.
zipcode_hierarchy, age_hierarchy, disease_hierarchy = (
    pd.read_csv(hierarchy_path, sep=";", header=None)
    for hierarchy_path in (
        "../data/data2_zipcode_hierarchy.csv",
        "../data/data2_age_hierarchy.csv",
        "../data/data2_disease_hierarchy.csv",
    )
)
[9]:
# Inspect the zipcode hierarchy: column 0 holds the original value and each
# following column masks one more trailing digit with '*'.
zipcode_hierarchy
[9]:
0 1 2 3 4 5
0 47677 4767* 476** 47*** 4**** *****
1 47602 4760* 476** 47*** 4**** *****
2 47678 4767* 476** 47*** 4**** *****
3 47905 4790* 479** 47*** 4**** *****
4 47909 4790* 479** 47*** 4**** *****
5 47906 4790* 479** 47*** 4**** *****
6 47605 4760* 476** 47*** 4**** *****
7 47673 4767* 476** 47*** 4**** *****
8 47607 4760* 476** 47*** 4**** *****
[10]:
# Attach each generalization hierarchy to its column, in the same order as before.
for attribute, hierarchy in (
    ("age", age_hierarchy),
    ("zipcode", zipcode_hierarchy),
    ("disease", disease_hierarchy),
):
    dataset.set_hierarchy(attribute, hierarchy)

Create Privacy Models

[11]:
# k-anonymity privacy model; the argument 4 is presumably k (the minimum
# size of each group of indistinguishable records) — confirm in the
# pyarxaas documentation.
kanon = KAnonymity(4)

Create Risk Profile

[13]:
# Request a risk profile for the dataset as it is now, before anonymize() is called.
risk_profile = arxaas.risk_profile(dataset)
[14]:
# Re-identification risk metrics for the original (not yet anonymized) data.
risk_profile.re_identification_risk
[14]:
{'estimated_journalist_risk': 1.0,
 'records_affected_by_highest_prosecutor_risk': 1.0,
 'sample_uniques': 1.0,
 'lowest_risk': 1.0,
 'estimated_prosecutor_risk': 1.0,
 'highest_journalist_risk': 1.0,
 'records_affected_by_lowest_risk': 1.0,
 'average_prosecutor_risk': 1.0,
 'estimated_marketer_risk': 1.0,
 'highest_prosecutor_risk': 1.0,
 'records_affected_by_highest_journalist_risk': 1.0,
 'population_uniques': 1.0}
[15]:
# First rows of the risk distribution table for the original data.
risk_profile.distribution_of_risk_dataframe().head()
[15]:
interval recordsWithMaxmalRiskWithinInterval recordsWithRiskWithinInteval
0 ]50,100] 1.0 1.0
1 ]33.4,50] 0.0 0.0
2 ]25,33.4] 0.0 0.0
3 ]20,25] 0.0 0.0
4 ]16.7,20] 0.0 0.0
[ ]:

Anonymize

[17]:
# Ask ARXaaS to anonymize the dataset; kanon is the only privacy model passed.
anon_result = arxaas.anonymize(dataset, [kanon])
[18]:
# Display the anonymized records via the result dataset's to_dataframe().
anon_result.dataset.to_dataframe()
[18]:
zipcode age salary disease
0 47*** * * stomach disease
1 47*** * * stomach disease
2 47*** * * stomach disease
3 47*** * * stomach disease
4 47*** * * respiratory infection
5 47*** * * respiratory infection
6 47*** * * respiratory infection
7 47*** * * respiratory infection
8 47*** * * stomach disease

Anonymization Status

The anonymization status indicates whether ARXaaS was able to anonymize the dataset so that it complies with the provided privacy models.

[19]:
# Status reported for this anonymization run.
anon_result.anonymization_status
[19]:
'ANONYMOUS'

RiskProfile for the anonymized dataset

[20]:
# Risk profile taken from the anonymization result itself; this cell makes
# no separate arxaas.risk_profile() call.
anon_rp = anon_result.risk_profile
[21]:
# Re-identification risk metrics for the anonymized dataset, for comparison
# with the values shown for the original data.
anon_rp.re_identification_risk
[21]:
{'estimated_journalist_risk': 0.25,
 'records_affected_by_highest_prosecutor_risk': 0.4444444444444444,
 'sample_uniques': 0.0,
 'lowest_risk': 0.2,
 'estimated_prosecutor_risk': 0.25,
 'highest_journalist_risk': 0.25,
 'records_affected_by_lowest_risk': 0.5555555555555556,
 'average_prosecutor_risk': 0.2222222222222222,
 'estimated_marketer_risk': 0.2222222222222222,
 'highest_prosecutor_risk': 0.25,
 'records_affected_by_highest_journalist_risk': 0.4444444444444444,
 'population_uniques': 0.0}
[22]:
# First 10 rows of the risk distribution table for the anonymized data.
anon_rp.distribution_of_risk_dataframe().head(10)
[22]:
interval recordsWithMaxmalRiskWithinInterval recordsWithRiskWithinInteval
0 ]50,100] 1.000000 0.000000
1 ]33.4,50] 1.000000 0.000000
2 ]25,33.4] 1.000000 0.000000
3 ]20,25] 1.000000 0.444444
4 ]16.7,20] 0.555556 0.555556
5 ]14.3,16.7] 0.000000 0.000000
6 ]12.5,14.3] 0.000000 0.000000
7 ]10,12.5] 0.000000 0.000000
8 ]9,10] 0.000000 0.000000
9 ]8,9] 0.000000 0.000000
[ ]: