Example analysis and anonymization of a sensitive dataset¶
[1]:
from pyarxaas import ARXaaS
from pyarxaas.privacy_models import KAnonymity, LDiversityDistinct
from pyarxaas import AttributeType
from pyarxaas import Dataset
import pandas as pd
Create ARXaaS connection¶
[2]:
arxaas = ARXaaS("http://localhost:8080/") # ARXaaS service URL (here: a local instance on port 8080)
Fetch sensitive data¶
[3]:
# Load the example records into a DataFrame; the file is semicolon-separated.
data_df = pd.read_csv("../data/data2.csv", sep=";")
[4]:
data_df
[4]:
zipcode | age | salary | disease | |
---|---|---|---|---|
0 | 47677 | 29 | 3 | gastric ulcer |
1 | 47602 | 22 | 4 | gastritis |
2 | 47678 | 27 | 5 | stomach cancer |
3 | 47905 | 43 | 6 | gastritis |
4 | 47909 | 52 | 11 | flu |
5 | 47906 | 47 | 8 | bronchitis |
6 | 47605 | 30 | 7 | bronchitis |
7 | 47673 | 36 | 9 | pneumonia |
8 | 47607 | 32 | 10 | stomach cancer |
Create Dataset¶
[6]:
# Wrap the DataFrame in a pyarxaas Dataset, the object passed to the ARXaaS client below.
dataset = Dataset.from_pandas(data_df)
Set the AttributeType for the dataset fields¶
[7]:
# Mark 'salary' as an identifying attribute; no other attribute type is set in this example.
# NOTE(review): the remaining columns presumably keep the library's default
# attribute type - confirm against the pyarxaas documentation.
dataset.set_attribute_type(AttributeType.IDENTIFYING, 'salary')
Set Generalization Hierarchies¶
Note: if the hierarchy CSV file does not have a header row, pass header=None to read_csv(). Otherwise the first row will be interpreted as a header, and ARXaaS will throw an exception because of the missing hierarchy data.
[8]:
# Options shared by all three hierarchy files: semicolon-separated, and
# header=None so that the first row is read as data rather than column names.
hierarchy_csv_options = {"sep": ";", "header": None}
zipcode_hierarchy = pd.read_csv(
    "../data/data2_zipcode_hierarchy.csv", **hierarchy_csv_options
)
age_hierarchy = pd.read_csv(
    "../data/data2_age_hierarchy.csv", **hierarchy_csv_options
)
disease_hierarchy = pd.read_csv(
    "../data/data2_disease_hierarchy.csv", **hierarchy_csv_options
)
[9]:
zipcode_hierarchy
[9]:
0 | 1 | 2 | 3 | 4 | 5 | |
---|---|---|---|---|---|---|
0 | 47677 | 4767* | 476** | 47*** | 4**** | ***** |
1 | 47602 | 4760* | 476** | 47*** | 4**** | ***** |
2 | 47678 | 4767* | 476** | 47*** | 4**** | ***** |
3 | 47905 | 4790* | 479** | 47*** | 4**** | ***** |
4 | 47909 | 4790* | 479** | 47*** | 4**** | ***** |
5 | 47906 | 4790* | 479** | 47*** | 4**** | ***** |
6 | 47605 | 4760* | 476** | 47*** | 4**** | ***** |
7 | 47673 | 4767* | 476** | 47*** | 4**** | ***** |
8 | 47607 | 4760* | 476** | 47*** | 4**** | ***** |
[10]:
# Attach each generalization hierarchy to the dataset column it belongs to.
for attribute_name, hierarchy_df in (
    ('age', age_hierarchy),
    ("zipcode", zipcode_hierarchy),
    ("disease", disease_hierarchy),
):
    dataset.set_hierarchy(attribute_name, hierarchy_df)
Create Privacy Models¶
[11]:
# k-anonymity privacy model; 4 is presumably the k parameter (minimum group
# size) - confirm against the pyarxaas documentation.
kanon = KAnonymity(4)
Create Risk Profile¶
[13]:
# Ask the ARXaaS client for a risk profile of the dataset before it is anonymized.
risk_profile = arxaas.risk_profile(dataset)
[14]:
risk_profile.re_identification_risk
[14]:
{'estimated_journalist_risk': 1.0,
'records_affected_by_highest_prosecutor_risk': 1.0,
'sample_uniques': 1.0,
'lowest_risk': 1.0,
'estimated_prosecutor_risk': 1.0,
'highest_journalist_risk': 1.0,
'records_affected_by_lowest_risk': 1.0,
'average_prosecutor_risk': 1.0,
'estimated_marketer_risk': 1.0,
'highest_prosecutor_risk': 1.0,
'records_affected_by_highest_journalist_risk': 1.0,
'population_uniques': 1.0}
[15]:
risk_profile.distribution_of_risk_dataframe().head()
[15]:
interval | recordsWithMaxmalRiskWithinInterval | recordsWithRiskWithinInteval | |
---|---|---|---|
0 | ]50,100] | 1.0 | 1.0 |
1 | ]33.4,50] | 0.0 | 0.0 |
2 | ]25,33.4] | 0.0 | 0.0 |
3 | ]20,25] | 0.0 | 0.0 |
4 | ]16.7,20] | 0.0 | 0.0 |
[ ]:
Anonymize¶
[17]:
# Anonymize the dataset; the privacy models to satisfy are passed as a list
# (here only the k-anonymity model).
anon_result = arxaas.anonymize(dataset, [kanon])
[18]:
anon_result.dataset.to_dataframe()
[18]:
zipcode | age | salary | disease | |
---|---|---|---|---|
0 | 47*** | * | * | stomach disease |
1 | 47*** | * | * | stomach disease |
2 | 47*** | * | * | stomach disease |
3 | 47*** | * | * | stomach disease |
4 | 47*** | * | * | respiratory infection |
5 | 47*** | * | * | respiratory infection |
6 | 47*** | * | * | respiratory infection |
7 | 47*** | * | * | respiratory infection |
8 | 47*** | * | * | stomach disease |
Anonymization Status¶
Anonymization status describes whether ARXaaS was able to anonymize the dataset so that it complies with the provided Privacy Models.
[19]:
anon_result.anonymization_status
[19]:
'ANONYMOUS'
RiskProfile for the anonymized dataset¶
[20]:
# Risk profile attached to the anonymization result, for comparison with the
# profile computed before anonymization.
anon_rp = anon_result.risk_profile
[21]:
anon_rp.re_identification_risk
[21]:
{'estimated_journalist_risk': 0.25,
'records_affected_by_highest_prosecutor_risk': 0.4444444444444444,
'sample_uniques': 0.0,
'lowest_risk': 0.2,
'estimated_prosecutor_risk': 0.25,
'highest_journalist_risk': 0.25,
'records_affected_by_lowest_risk': 0.5555555555555556,
'average_prosecutor_risk': 0.2222222222222222,
'estimated_marketer_risk': 0.2222222222222222,
'highest_prosecutor_risk': 0.25,
'records_affected_by_highest_journalist_risk': 0.4444444444444444,
'population_uniques': 0.0}
[22]:
anon_rp.distribution_of_risk_dataframe().head(10)
[22]:
interval | recordsWithMaxmalRiskWithinInterval | recordsWithRiskWithinInteval | |
---|---|---|---|
0 | ]50,100] | 1.000000 | 0.000000 |
1 | ]33.4,50] | 1.000000 | 0.000000 |
2 | ]25,33.4] | 1.000000 | 0.000000 |
3 | ]20,25] | 1.000000 | 0.444444 |
4 | ]16.7,20] | 0.555556 | 0.555556 |
5 | ]14.3,16.7] | 0.000000 | 0.000000 |
6 | ]12.5,14.3] | 0.000000 | 0.000000 |
7 | ]10,12.5] | 0.000000 | 0.000000 |
8 | ]9,10] | 0.000000 | 0.000000 |
9 | ]8,9] | 0.000000 | 0.000000 |
[ ]: