Census
Sun 29 June 2025
import pandas as pd
# Publicly available dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
# Column names are passed explicitly via `names=` (taken from the UCI description)
columns = [
    "age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
    "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
    "hours-per-week", "native-country", "income",
]
# skipinitialspace=True drops the blank that follows each comma *before* the
# NA comparison happens, so the missing-value marker has to be given as "?".
# A pattern of " ?" would never match and would leave literal "?" strings in
# the data. Rows with missing fields now show up as NaN; handle them
# (dropna/fillna) before modelling.
census = pd.read_csv(url, names=columns, na_values="?", skipinitialspace=True)
# Preview
print(census.head())
age workclass fnlwgt education education-num \
0 39 State-gov 77516 Bachelors 13
1 50 Self-emp-not-inc 83311 Bachelors 13
2 38 Private 215646 HS-grad 9
3 53 Private 234721 11th 7
4 28 Private 338409 Bachelors 13
marital-status occupation relationship race sex \
0 Never-married Adm-clerical Not-in-family White Male
1 Married-civ-spouse Exec-managerial Husband White Male
2 Divorced Handlers-cleaners Not-in-family White Male
3 Married-civ-spouse Handlers-cleaners Husband Black Male
4 Married-civ-spouse Prof-specialty Wife Black Female
capital-gain capital-loss hours-per-week native-country income
0 2174 0 40 United-States <=50K
1 0 0 13 United-States <=50K
2 0 0 40 United-States <=50K
3 0 0 40 United-States <=50K
4 0 0 40 Cuba <=50K
import pandas as pd
# Location of the raw UCI "adult" data file
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
# Column names are supplied explicitly through `names=`
columns = [
    "age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
    "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
    "hours-per-week", "native-country", "income",
]
# With skipinitialspace=True the leading blank of every field is removed
# before NA matching, so the missing-value marker must be "?" (not " ?",
# which can never match). Missing entries are therefore read as NaN.
census = pd.read_csv(url, names=columns, na_values="?", skipinitialspace=True)
census.head()
census['income'].unique()
| age | workclass | fnlwgt | education | education-num | marital-status | occupation | relationship | race | sex | capital-gain | capital-loss | hours-per-week | native-country | income | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 39 | State-gov | 77516 | Bachelors | 13 | Never-married | Adm-clerical | Not-in-family | White | Male | 2174 | 0 | 40 | United-States | <=50K |
| 1 | 50 | Self-emp-not-inc | 83311 | Bachelors | 13 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 13 | United-States | <=50K |
| 2 | 38 | Private | 215646 | HS-grad | 9 | Divorced | Handlers-cleaners | Not-in-family | White | Male | 0 | 0 | 40 | United-States | <=50K |
| 3 | 53 | Private | 234721 | 11th | 7 | Married-civ-spouse | Handlers-cleaners | Husband | Black | Male | 0 | 0 | 40 | United-States | <=50K |
| 4 | 28 | Private | 338409 | Bachelors | 13 | Married-civ-spouse | Prof-specialty | Wife | Black | Female | 0 | 0 | 40 | Cuba | <=50K |
census['income'].unique()
array(['<=50K', '>50K'], dtype=object)
# Fix the income bracket
def label_fix(label: str) -> int:
    """Encode an income-bracket label as a binary class.

    Args:
        label: Raw label text such as ``"<=50K"`` or ``">50K"``. Surrounding
            whitespace, letter case and a trailing period (e.g. ``"<=50K."``)
            are ignored.

    Returns:
        0 when the label denotes the ``<=50K`` bracket, 1 for any other label.
    """
    # Normalise first so that " <=50K", "<=50k" and "<=50K." all compare equal.
    normalized = label.strip().rstrip(".").lower()
    return 0 if normalized == "<=50k" else 1
# Swap the text labels for their 0/1 encoding and store them under the new
# name in one step. 'income' is the last column, so popping it and assigning
# 'income_bracket' keeps the column order unchanged.
census['income_bracket'] = census.pop('income').map(label_fix)
census['income_bracket'].unique()
array([0, 1])
# Train Test Split
from sklearn.model_selection import train_test_split
# Features are every column except the target; the target is the 0/1 bracket.
x_data = census.drop('income_bracket', axis=1)
y_labels = census['income_bracket']
# Hold out 30% of the rows; the fixed random_state keeps the split repeatable.
# (The held-out labels were previously bound to the misspelled name `y_est`.)
X_train, x_test, y_train, y_test = train_test_split(
    x_data, y_labels, test_size=0.3, random_state=101
)
census.columns
Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
'marital-status', 'occupation', 'relationship', 'race', 'sex',
'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
'income_bracket'],
dtype='object')
import tensorflow as tf
from tensorflow.feature_column import categorical_column_with_vocabulary_list as vlist
from tensorflow.keras.layers import StringLookup
gender_lookup = StringLookup(vocabulary=["Female", "Male"], output_mode="int")
# `gender` was displayed (and is used in `feat_cols`) without ever being
# assigned, which raises NameError. Define it as a vocabulary-list column.
# The key is "sex" because that is the name of the DataFrame column; the
# DataFrame has no column called "gender".
gender = vlist("sex", ["Female", "Male"])
gender
VocabularyListCategoricalColumn(key='gender', vocabulary_list=('Female', 'Male'), dtype=tf.string, default_value=-1, num_oov_buckets=0)
from tensorflow.feature_column import categorical_column_with_hash_bucket as bucket
# Hashed categorical columns. Each key must be the exact DataFrame column
# name, and those names use hyphens ("marital-status", "native-country"),
# not underscores.
occupation = bucket("occupation", hash_bucket_size=1000)
marital_status = bucket("marital-status", hash_bucket_size=1000)
relationship = bucket("relationship", hash_bucket_size=1000)
education = bucket("education", hash_bucket_size=1000)
workclass = bucket("workclass", hash_bucket_size=1000)
native_country = bucket("native-country", hash_bucket_size=1000)
from tensorflow.feature_column import numeric_column as nc
# Numeric feature columns. Each key must be the exact DataFrame column name,
# and those names use hyphens (e.g. "education-num"), not underscores.
age = nc("age")
education_num = nc("education-num")
capital_gain = nc("capital-gain")
capital_loss = nc("capital-loss")
hours_per_week = nc("hours-per-week")
# Collect all feature columns in one list: categorical ones first, then numeric.
feat_cols = [
    gender,
    occupation,
    marital_status,
    relationship,
    education,
    workclass,
    native_country,
    age,
    education_num,
    capital_gain,
    capital_loss,
    hours_per_week,
]
feat_cols
[VocabularyListCategoricalColumn(key='gender', vocabulary_list=('Female', 'Male'), dtype=tf.string, default_value=-1, num_oov_buckets=0),
HashedCategoricalColumn(key='occupation', hash_bucket_size=1000, dtype=tf.string),
HashedCategoricalColumn(key='marital_status', hash_bucket_size=1000, dtype=tf.string),
HashedCategoricalColumn(key='relationship', hash_bucket_size=1000, dtype=tf.string),
HashedCategoricalColumn(key='education', hash_bucket_size=1000, dtype=tf.string),
HashedCategoricalColumn(key='workclass', hash_bucket_size=1000, dtype=tf.string),
HashedCategoricalColumn(key='native_country', hash_bucket_size=1000, dtype=tf.string),
NumericColumn(key='age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
NumericColumn(key='education_num', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
NumericColumn(key='capital_gain', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
NumericColumn(key='capital_loss', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
NumericColumn(key='hours_per_week', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)]
Score: 20
Category: basics