Census

Sun 29 June 2025
import pandas as pd

# Publicly available dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"

# Define column names (from UCI description)
columns = [
    "age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
    "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
    "hours-per-week", "native-country", "income"
]

# Read CSV with defined parameters
census = pd.read_csv(url, names=columns, na_values=" ?", skipinitialspace=True)

# Preview
print(census.head())
   age         workclass  fnlwgt  education  education-num  \
0   39         State-gov   77516  Bachelors             13   
1   50  Self-emp-not-inc   83311  Bachelors             13   
2   38           Private  215646    HS-grad              9   
3   53           Private  234721       11th              7   
4   28           Private  338409  Bachelors             13

       marital-status         occupation   relationship   race     sex  \
0       Never-married       Adm-clerical  Not-in-family  White    Male   
1  Married-civ-spouse    Exec-managerial        Husband  White    Male   
2            Divorced  Handlers-cleaners  Not-in-family  White    Male   
3  Married-civ-spouse  Handlers-cleaners        Husband  Black    Male   
4  Married-civ-spouse     Prof-specialty           Wife  Black  Female

   capital-gain  capital-loss  hours-per-week native-country income  
0          2174             0              40  United-States  <=50K  
1             0             0              13  United-States  <=50K  
2             0             0              40  United-States  <=50K  
3             0             0              40  United-States  <=50K  
4             0             0              40           Cuba  <=50K
import pandas as pd
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
columns = [
    "age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
    "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
    "hours-per-week", "native-country", "income"
]
census = pd.read_csv(url, names=columns, na_values=" ?", skipinitialspace=True)
census.head()
census['income'].unique()
age workclass fnlwgt education education-num marital-status occupation relationship race sex capital-gain capital-loss hours-per-week native-country income
0 39 State-gov 77516 Bachelors 13 Never-married Adm-clerical Not-in-family White Male 2174 0 40 United-States <=50K
1 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse Exec-managerial Husband White Male 0 0 13 United-States <=50K
2 38 Private 215646 HS-grad 9 Divorced Handlers-cleaners Not-in-family White Male 0 0 40 United-States <=50K
3 53 Private 234721 11th 7 Married-civ-spouse Handlers-cleaners Husband Black Male 0 0 40 United-States <=50K
4 28 Private 338409 Bachelors 13 Married-civ-spouse Prof-specialty Wife Black Female 0 0 40 Cuba <=50K
census['income'].unique()
array(['<=50K', '>50K'], dtype=object)
# Fix the income bracket
def label_fix(label):
    label = label.strip().lower()
    if(label == '<=50k'):
        return 0
    return 1
census['income'] = census['income'].apply(label_fix)
census.rename(columns={"income": "income_bracket"}, inplace=True)
census['income_bracket'].unique()
array([0, 1])
# Train Test Split
from sklearn.model_selection import train_test_split
x_data = census.drop('income_bracket', axis=1)
y_labels = census['income_bracket']

X_train, x_test, y_train, y_est = train_test_split(x_data, y_labels, test_size=0.3, random_state=101)
census.columns
Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'income_bracket'],
      dtype='object')
import tensorflow as tf
from tensorflow.feature_column import categorical_column_with_vocabulary_list as vlist
from tensorflow.keras.layers import StringLookup
gender_lookup = StringLookup(vocabulary=["Female", "Male"], output_mode="int")
gender
VocabularyListCategoricalColumn(key='gender', vocabulary_list=('Female', 'Male'), dtype=tf.string, default_value=-1, num_oov_buckets=0)
from tensorflow.feature_column import categorical_column_with_hash_bucket as bucket
occupation = bucket("occupation", hash_bucket_size=1000)
marital_status = bucket("marital_status", hash_bucket_size=1000)
relationship = bucket("relationship", hash_bucket_size=1000)
education = bucket("education", hash_bucket_size=1000)
workclass = bucket("workclass", hash_bucket_size=1000)
native_country = bucket("native_country", hash_bucket_size=1000)
from tensorflow.feature_column import numeric_column as nc
age = nc("age")
education_num = nc("education_num")
capital_gain = nc("capital_gain")
capital_loss = nc("capital_loss")
hours_per_week = nc("hours_per_week")
feat_cols = [gender, occupation, marital_status, relationship, education, workclass, native_country, age, education_num, 
            capital_gain, capital_loss, hours_per_week]
feat_cols
[VocabularyListCategoricalColumn(key='gender', vocabulary_list=('Female', 'Male'), dtype=tf.string, default_value=-1, num_oov_buckets=0),
 HashedCategoricalColumn(key='occupation', hash_bucket_size=1000, dtype=tf.string),
 HashedCategoricalColumn(key='marital_status', hash_bucket_size=1000, dtype=tf.string),
 HashedCategoricalColumn(key='relationship', hash_bucket_size=1000, dtype=tf.string),
 HashedCategoricalColumn(key='education', hash_bucket_size=1000, dtype=tf.string),
 HashedCategoricalColumn(key='workclass', hash_bucket_size=1000, dtype=tf.string),
 HashedCategoricalColumn(key='native_country', hash_bucket_size=1000, dtype=tf.string),
 NumericColumn(key='age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='education_num', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='capital_gain', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='capital_loss', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='hours_per_week', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)]


Score: 20

Category: basics