Pandas

Sun 29 June 2025

import pandas as pd

s = pd.Series([10, 20, 30])
print(s)

0    10
1    20
2    30
dtype: int64

s = pd.Series([10, 20, 30], index=['a', 'b', 'c'])
print(s)

a    10
b    20
c    30
dtype: int64

s = pd.Series({'a': 1, 'b': 2})
print(s)

a    1
b    2
dtype: int64

print(s['a'])

print(s[s > 1])

b    2
dtype: int64

s1 = pd.Series([1, 2])
s2 = pd.Series([3, 4])
print(s1 + s2)

0    4
1    6
dtype: int64

s = pd.Series([1, None, 3])
print(s.isnull())

0    False
1     True
2    False
dtype: bool

print(s.fillna(0))

0    1.0
1    0.0
2    3.0
dtype: float64

print(s.dropna())

0    1.0
2    3.0
dtype: float64

s.name = "my_series"
print(s.name)

my_series

df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
print(df)

   A  B
0  1  3
1  2  4

df = pd.DataFrame([[1, 2], [3, 4]], columns=['A', 'B'])
print(df)

   A  B
0  1  2
1  3  4

df = pd.DataFrame({'A': [1, 2]}, index=['x', 'y'])
print(df)

   A
x  1
y  2

s1 = pd.Series([1, 2], name='X')
s2 = pd.Series([3, 4], name='Y')
df = pd.concat([s1, s2], axis=1)
print(df)

   X  Y
0  1  3
1  2  4

print(pd.DataFrame())

Empty DataFrame
Columns: []
Index: []

df = pd.DataFrame({'A': [1, None], 'B': [None, 4]})
print(df)

     A    B
0  1.0  NaN
1  NaN  4.0

df['C'] = [5, 6]
print(df)

     A    B  C
0  1.0  NaN  5
1  NaN  4.0  6

del df['C']
print(df)

     A    B
0  1.0  NaN
1  NaN  4.0

df.insert(1, 'C', [9, 8])
print(df)

     A  C    B
0  1.0  9  NaN
1  NaN  8  4.0

df = df.rename(columns={'A': 'X'})
print(df)

     X  C    B
0  1.0  9  NaN
1  NaN  8  4.0

print(df.head())

     X  C    B
0  1.0  9  NaN
1  NaN  8  4.0

print(df.tail())

     X  C    B
0  1.0  9  NaN
1  NaN  8  4.0

print(df.shape)

(2, 3)

print(df.columns)

Index(['X', 'C', 'B'], dtype='object')

print(df.index)

RangeIndex(start=0, stop=2, step=1)

print(df.dtypes)

X    float64
C      int64
B    float64
dtype: object

# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() also emits a trailing "None" line after the report.
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   X       1 non-null      float64
 1   C       2 non-null      int64  
 2   B       1 non-null      float64
dtypes: float64(2), int64(1)
memory usage: 180.0 bytes
None

print(df.describe())

         X         C    B
count  1.0  2.000000  1.0
mean   1.0  8.500000  4.0
std    NaN  0.707107  NaN
min    1.0  8.000000  4.0
25%    1.0  8.250000  4.0
50%    1.0  8.500000  4.0
75%    1.0  8.750000  4.0
max    1.0  9.000000  4.0

print(df.count())

X    1
C    2
B    1
dtype: int64

print(df.memory_usage())

Index    132
X         16
C         16
B         16
dtype: int64

print(df.memory_usage())

Index    132
X         16
C         16
B         16
dtype: int64

print(df[['X', 'B']])

     X    B
0  1.0  NaN
1  NaN  4.0

print(df.iloc[0])

X    1.0
C    9.0
B    NaN
Name: 0, dtype: float64

print(df[df['X'] > 1])

Empty DataFrame
Columns: [X, C, B]
Index: []

df = df.set_index('X')
print(df)

     C    B
X          
1.0  9  NaN
NaN  8  4.0

print(df.reset_index())

     X  C    B
0  1.0  9  NaN
1  NaN  8  4.0

print('X' in df.columns)

False

print(df.iat[0, 0])

df = pd.DataFrame({'A': [3, 1, 2]})
print(df.sort_values('A'))

print(df.sort_index())

print(df.sort_values('A', ascending=False))

df = pd.DataFrame({'A': [1, 1, 2], 'B': [3, 2, 1]})
print(df.sort_values(['A', 'B']))

s = pd.Series([3, 1, 2])
print(s.sort_values())

1    1
2    2
0    3
dtype: int64

print(s.argsort())

0    1
1    2
2    0
dtype: int64

print(s.rank())

0    3.0
1    1.0
2    2.0
dtype: float64

print(s.sort_index())

0    3
1    1
2    2
dtype: int64

print(s.nlargest(2))

0    3
2    2
dtype: int64

print(s.nsmallest(2))

1    1
2    2
dtype: int64

df = pd.DataFrame({'A': [1, None]})
print(df.isnull().any())

A    True
dtype: bool

print(df.notnull().all())

A    False
dtype: bool

print(df.fillna(df.mean()))

     A
0  1.0
1  1.0

print(df.fillna(df.median()))

     A
0  1.0
1  1.0

df = pd.DataFrame({'A': [1, None, 3]})
print(df.interpolate())

print(df.isnull().sum())

A    1
dtype: int64

df.fillna(0, inplace=True)
print(df)

df = pd.DataFrame({'A': [1, None], 'B': [2, 3]})
# Boolean mask: True for rows where every column holds a value.
complete_rows = df.notnull().all(axis=1)
print(df[complete_rows])

     A  B
0  1.0  2

df = pd.DataFrame({'A': [1, 1, 2]})
print(df.drop_duplicates())

   A
0  1
2  2

print(df.drop_duplicates(keep='first'))

   A
0  1
2  2

df = pd.DataFrame({'A': ['1', '2']})
df['A'] = df['A'].astype(float)
print(df)

     A
0  1.0
1  2.0

df['A'] = df['A'].astype(int)
print(df)

   A
0  1
1  2

df['A'] = df['A'].astype(str)
print(df)

   A
0  1
1  2

df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
print(df.astype(float))

     A    B
0  1.0  3.0
1  2.0  4.0

print(df.apply(lambda x: x.astype(str)))

   A  B
0  1  3
1  2  4

df = pd.DataFrame({'date': ['2025-01-01', '2025-02-01']})
df['date'] = pd.to_datetime(df['date'])
print(df)

        date
0 2025-01-01
1 2025-02-01

print(df.dtypes)

date    datetime64[ns]
dtype: object

print(df['date'].tolist())

[Timestamp('2025-01-01 00:00:00'), Timestamp('2025-02-01 00:00:00')]

print(df.to_dict())

{'date': {0: Timestamp('2025-01-01 00:00:00'), 1: Timestamp('2025-02-01 00:00:00')}}

print(df.values)

[['2025-01-01T00:00:00.000000000']
 ['2025-02-01T00:00:00.000000000']]

df = pd.DataFrame({'A': ['foo', 'foo', 'bar'], 'B': [1, 2, 3]})
grouped = df.groupby('A')
print(grouped.sum())

     B
A     
bar  3
foo  3

print(df.groupby('A').mean())

       B
A       
bar  3.0
foo  1.5

print(df.groupby('A').count())

     B
A     
bar  1
foo  2

print(df.groupby('A').agg(['sum', 'mean']))

      B     
    sum mean
A           
bar   3  3.0
foo   3  1.5

print(df.groupby('A')['B'].agg('max'))

A
bar    3
foo    2
Name: B, dtype: int64

df = pd.DataFrame({'A': ['foo', 'foo', 'bar'], 'B': ['x', 'y', 'x'], 'C': [1, 2, 3]})
print(df.groupby(['A', 'B']).sum())

       C
A   B   
bar x  3
foo x  1
    y  2

print(df.groupby('A')['C'].apply(lambda x: x.max() - x.min()))

A
bar    0
foo    1
Name: C, dtype: int64

print(df.groupby('A').sum().reset_index())

     A   B  C
0  bar   x  3
1  foo  xy  3

print(df.groupby('A').filter(lambda x: x['C'].sum() > 2))

     A  B  C
0  foo  x  1
1  foo  y  2
2  bar  x  3

print(df.groupby('A')['C'].transform('sum'))

0    3
1    3
2    3
Name: C, dtype: int64

df = pd.DataFrame({'A': [1, 2, 3, 4, 5]})
print(df['A'].rolling(2).mean())

0    NaN
1    1.5
2    2.5
3    3.5
4    4.5
Name: A, dtype: float64

print(df['A'].rolling(3).sum())

0     NaN
1     NaN
2     6.0
3     9.0
4    12.0
Name: A, dtype: float64

print(df['A'].rolling(2).max())

0    NaN
1    2.0
2    3.0
3    4.0
4    5.0
Name: A, dtype: float64

print(df['A'].expanding().mean())

0    1.0
1    1.5
2    2.0
3    2.5
4    3.0
Name: A, dtype: float64

print(df['A'].expanding().sum())

0     1.0
1     3.0
2     6.0
3    10.0
4    15.0
Name: A, dtype: float64

print(df['A'].rolling(2).apply(lambda x: x.max() - x.min()))

0    NaN
1    1.0
2    1.0
3    1.0
4    1.0
Name: A, dtype: float64

print(df['A'].rolling(3, min_periods=1).mean())

0    1.0
1    1.5
2    2.0
3    3.0
4    4.0
Name: A, dtype: float64

print(df['A'].cumsum())

0     1
1     3
2     6
3    10
4    15
Name: A, dtype: int64

print(df['A'].cumprod())

0      1
1      2
2      6
3     24
4    120
Name: A, dtype: int64

print(df['A'].cummax())

0    1
1    2
2    3
3    4
4    5
Name: A, dtype: int64

s = pd.Series(['Hello', 'World'])
print(s.str.lower())

0    hello
1    world
dtype: object

print(s.str.upper())

0    HELLO
1    WORLD
dtype: object

print(s.str.len())

0    5
1    5
dtype: int64

print(s.str.contains('o'))

0    True
1    True
dtype: bool

print(s.str.replace('l', 'x'))

0    Hexxo
1    Worxd
dtype: object

print(s.str.startswith('H'))

0     True
1    False
dtype: bool

print(s.str.endswith('d'))

0    False
1     True
dtype: bool

s = pd.Series([' A ', ' B '])
print(s.str.strip())

0    A
1    B
dtype: object

s = pd.Series(['a,b,c', '1,2,3'])
print(s.str.split(','))

0    [a, b, c]
1    [1, 2, 3]
dtype: object

df1 = pd.DataFrame({'A': [1, 2]})
df2 = pd.DataFrame({'A': [3, 4]})
print(pd.concat([df1, df2]))

df1 = pd.DataFrame({'key': ['A', 'B'], 'val1': [1, 2]})
df2 = pd.DataFrame({'key': ['A', 'B'], 'val2': [3, 4]})
print(pd.merge(df1, df2, on='key'))

  key  val1  val2
0   A     1     3
1   B     2     4

df1 = pd.DataFrame({'k1': ['A', 'B'], 'val1': [1, 2]})
df2 = pd.DataFrame({'k2': ['A', 'B'], 'val2': [3, 4]})
print(pd.merge(df1, df2, left_on='k1', right_on='k2'))

  k1  val1 k2  val2
0  A     1  A     3
1  B     2  B     4

print(pd.merge(df1, df2, left_on='k1', right_on='k2', how='outer'))

  k1  val1 k2  val2
0  A     1  A     3
1  B     2  B     4

print(pd.merge(df1, df2, left_on='k1', right_on='k2', how='inner'))

  k1  val1 k2  val2
0  A     1  A     3
1  B     2  B     4

df1 = pd.DataFrame({'val1': [1, 2]}, index=['A', 'B'])
df2 = pd.DataFrame({'val2': [3, 4]}, index=['A', 'B'])
print(df1.join(df2))

   val1  val2
A     1     3
B     2     4

df2 = df2.rename_axis('idx')
print(df1.join(df2, how='left'))

   val1  val2
A     1     3
B     2     4

# Build the single-row frame to append: same 'val1' column, new index label 'C'.
new_row = pd.DataFrame({'val1': [5]}, index=['C'])
# Stack it under df1, then move the A/B/C labels into a regular 'index' column.
print(pd.concat([df1, new_row]).reset_index())

  index  val1
0     A     1
1     B     2
2     C     5

df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
df.to_csv('output.csv', index=False)

df.to_excel('output.xlsx', index=False)

df.to_json('output.json', orient='records')

# Serialize the frame to disk with pickle.
df.to_pickle('data.pkl')

# NOTE: unpickling can execute arbitrary code; only call read_pickle on files
# you created yourself or otherwise trust.
df_loaded = pd.read_pickle('data.pkl')
print(df_loaded)

   A  B
0  1  3
1  2  4

import matplotlib.pyplot as plt

df = pd.DataFrame({'x': range(5), 'y': [1, 2, 3, 4, 5]})
df.plot(x='x', y='y')
plt.show()

png

df.plot.bar(x='x', y='y')
plt.show()

png

df.plot.barh(x='x', y='y')
plt.show()

png

df['y'].plot.hist()
plt.show()

png

df.plot.box()
plt.show()

png

df.plot.area()
plt.show()

png

df.plot.scatter(x='x', y='y')
plt.show()

png

df = pd.DataFrame({'category': ['A', 'B', 'C'], 'count': [10, 20, 30]})
df.set_index('category').plot.pie(y='count', autopct='%1.1f%%')
plt.show()

png

df2 = pd.DataFrame({'a': range(5), 'b': [2, 3, 2, 4, 5]})
df2.plot()
plt.show()

png

# Two parallel arrays, one per index level: the letters and their numbers.
arrays = [['A', 'A', 'B', 'B'], [1, 2, 1, 2]]
level_names = ('letters', 'numbers')
index = pd.MultiIndex.from_arrays(arrays, names=level_names)
# A single 'value' column laid out against the two-level row index.
df = pd.DataFrame([10, 20, 30, 40], index=index, columns=['value'])
print(df)

                 value
letters numbers       
A       1           10
        2           20
B       1           30
        2           40

import pandas as pd
import matplotlib.pyplot as plt

df = pd.DataFrame({
    'x': range(1, 6),
    'y1': [1, 3, 2, 5, 4],
    'y2': [2, 4, 1, 3, 5]
})

df.plot(x='x', y='y1')
plt.title("Line Plot")
plt.show()

png

df.plot(x='x', y=['y1', 'y2'])
plt.title("Multiple Line Plot")
plt.show()

png

df.plot.bar(x='x', y='y1')
plt.title("Bar Plot")
plt.show()

png

df.plot.barh(x='x', y='y1')
plt.title("Horizontal Bar Plot")
plt.show()

png

df.plot.bar(x='x', y=['y1', 'y2'], stacked=True)
plt.title("Stacked Bar Plot")
plt.show()

png

df.plot.area(x='x', y=['y1', 'y2'])
plt.title("Area Plot")
plt.show()

png

df.plot.area(x='x', y=['y1', 'y2'], stacked=True)
plt.title("Stacked Area Plot")
plt.show()

png

df.plot.scatter(x='x', y='y1')
plt.title("Scatter Plot")
plt.show()

png

df.plot.scatter(x='x', y='y1', c='y2', cmap='viridis')
plt.title("Colored Scatter Plot")
plt.show()

png

df['y1'].plot.hist(bins=5)
plt.title("Histogram")
plt.show()

png

df[['y1', 'y2']].plot.box()
plt.title("Box Plot")
plt.show()

png

df['y1'].plot.kde()
plt.title("KDE Plot")
plt.show()

png

df_sum = df[['y1', 'y2']].sum()
df_sum.plot.pie(autopct='%1.1f%%')
plt.title("Pie Chart")
plt.ylabel("")
plt.show()

png

df.plot(x='x', y='y1', marker='o')
plt.title("Line Plot with Markers")
plt.show()

png

df.plot(x='x', y='y1', style='--')
plt.title("Dashed Line Plot")
plt.show()

png

df.plot(x='x', y='y1', grid=True)
plt.title("Line Plot with Grid")
plt.show()

png

# Draw y1 first and keep the returned Axes so the second series can reuse it.
ax = df.plot(x='x', y='y1', color='blue', label='y1')
# Plot y2 onto that same Axes; secondary_y=True requests a separate y-axis for it.
df.plot(x='x', y='y2', ax=ax, color='red', secondary_y=True, label='y2')
plt.title("Dual Y-Axis")
plt.show()

png

df.drop('x', axis=1).T.plot()
plt.title("Transposed Plot")
plt.show()

png

# With 'x' moved into the index, subplots=True gives each remaining column its
# own panel; layout=(2, 1) arranges the panels as two rows in one column.
df.set_index('x').plot(subplots=True, layout=(2, 1), figsize=(6, 6))
plt.suptitle("Subplots")
plt.show()

png

# Write the figure to line_plot.png instead of displaying it, then close it.
df.plot(x='x', y='y1')
plt.title("Saved Plot")
plt.savefig("line_plot.png")
plt.close()

arrays = [['A', 'A', 'B', 'B'], [1, 2, 1, 2]]
index = pd.MultiIndex.from_arrays(arrays, names=('letters', 'numbers'))
df = pd.DataFrame({'value': [10, 20, 30, 40]}, index=index)
print(df)

                 value
letters numbers       
A       1           10
        2           20
B       1           30
        2           40

print(df.loc[('A', 1)])

value    10
Name: (A, 1), dtype: int64

print(df.loc['A'])

         value
numbers       
1           10
2           20

print(df.reset_index())

  letters  numbers  value
0       A        1     10
1       A        2     20
2       B        1     30
3       B        2     40

df2 = df.reset_index().set_index(['letters', 'numbers'])
print(df2)

                 value
letters numbers       
A       1           10
        2           20
B       1           30
        2           40

print(df.sort_index())

                 value
letters numbers       
A       1           10
        2           20
B       1           30
        2           40

print(df.swaplevel())

                 value
numbers letters       
1       A           10
2       A           20
1       B           30
2       B           40

print(df.index.names)

['letters', 'numbers']

print(df.index.get_level_values(0))

Index(['A', 'A', 'B', 'B'], dtype='object', name='letters')

print(df.groupby(level='letters').sum())

         value
letters       
A           30
B           70

df = pd.DataFrame({'A': range(1000)})
print(df.memory_usage(deep=True))

Index     132
A        8000
dtype: int64

df['A'] = df['A'].astype('int16')
print(df.dtypes)

A    int16
dtype: object

df['B'] = pd.Series([1.5]*1000).astype('float32')
print(df.dtypes)

A      int16
B    float32
dtype: object

df['C'] = pd.Series(['a']*500 + ['b']*500)
df['C'] = df['C'].astype('category')
print(df.dtypes)

A       int16
B     float32
C    category
dtype: object

df['A'] = df['A'] + 1

df.loc[0, 'A'] = 999

df['D'] = df['A'] * 2

import numpy as np
df['E'] = np.log1p(df['A'])

df_large = pd.DataFrame(np.zeros((10000, 10)))

dates = pd.date_range(start='2025-01-01', periods=5, freq='D')
print(dates)

DatetimeIndex(['2025-01-01', '2025-01-02', '2025-01-03', '2025-01-04',
               '2025-01-05'],
              dtype='datetime64[ns]', freq='D')

df = pd.DataFrame({'date': dates, 'value': [1, 2, 3, 4, 5]})
df.set_index('date', inplace=True)
print(df)

            value
date             
2025-01-01      1
2025-01-02      2
2025-01-03      3
2025-01-04      4
2025-01-05      5

print(df.rolling(2).mean())

            value
date             
2025-01-01    NaN
2025-01-02    1.5
2025-01-03    2.5
2025-01-04    3.5
2025-01-05    4.5

print(df.shift(1))

            value
date             
2025-01-01    NaN
2025-01-02    1.0
2025-01-03    2.0
2025-01-04    3.0
2025-01-05    4.0

print(df.pct_change())

               value
date                
2025-01-01       NaN
2025-01-02  1.000000
2025-01-03  0.500000
2025-01-04  0.333333
2025-01-05  0.250000

print(df.index.to_series().diff())

date
2025-01-01      NaT
2025-01-02   1 days
2025-01-03   1 days
2025-01-04   1 days
2025-01-05   1 days
Name: date, dtype: timedelta64[ns]

print(df.loc['2025-01-03'])

value    3
Name: 2025-01-03 00:00:00, dtype: int64

print(df['2025-01-02':'2025-01-04'])

            value
date             
2025-01-02      2
2025-01-03      3
2025-01-04      4

df = pd.DataFrame({'year': [2025]*3, 'month': [1, 2, 3], 'day': [1, 2, 3]})
df['date'] = pd.to_datetime(df[['year', 'month', 'day']])
print(df)

   year  month  day       date
0  2025      1    1 2025-01-01
1  2025      2    2 2025-02-02
2  2025      3    3 2025-03-03

df = pd.DataFrame({'A': [1, 2], 'B': [[1, 2], [3, 4]]})
print(df.explode('B'))

df = pd.DataFrame({'id': [1], 'Jan': [10], 'Feb': [20]})
print(pd.melt(df, id_vars='id', var_name='month', value_name='sales'))

   id month  sales
0   1   Jan     10
1   1   Feb     20

df = pd.DataFrame({'id': [1, 1], 'month': ['Jan', 'Feb'], 'sales': [10, 20]})
print(df.pivot(index='id', columns='month', values='sales'))

month  Feb  Jan
id             
1       20   10

df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
print(df.stack())

0  A    1
   B    3
1  A    2
   B    4
dtype: int64

print(df.stack().unstack())

   A  B
0  1  3
1  2  4

arrays = [['A', 'A', 'B'], ['one', 'two', 'one']]
df = pd.DataFrame([[1, 2, 3]], columns=pd.MultiIndex.from_arrays(arrays))
print(df)

    A       B
  one two one
0   1   2   3

# Collapse each two-level column tuple into a single 'top_bottom' label.
df.columns = df.columns.map('_'.join)
print(df)

   A_one  A_two  B_one
0      1      2      3

df = pd.DataFrame({'id': [1, 2, 1], 'month': ['Jan', 'Jan', 'Feb'], 'val': [10, 20, 30]})
print(df.groupby(['id', 'month'])['val'].sum().unstack())

month   Feb   Jan
id               
1      30.0  10.0
2       NaN  20.0

# Two frames keyed on 'time', each already sorted on that key.
df1 = pd.DataFrame({
    'time': pd.to_datetime(['2025-01-01', '2025-01-03']),
    'val1': [1, 2],
})
df2 = pd.DataFrame({
    'time': pd.to_datetime(['2025-01-02', '2025-01-04']),
    'val2': [3, 4],
})
# Each df1 row picks up the nearest df2 row that is not later than it;
# 2025-01-01 has no such row, so its val2 comes back as NaN.
print(pd.merge_asof(df1, df2, on='time'))

        time  val1  val2
0 2025-01-01     1   NaN
1 2025-01-03     2   3.0

df1 = pd.DataFrame({'A': [1, 2]})
df2 = pd.DataFrame({'B': ['x', 'y']})
print(df1.merge(df2, how='cross'))

df1 = pd.DataFrame({'id': [1, 2]})
df2 = pd.DataFrame({'id': [2, 3]})
print(pd.merge(df1, df2, on='id', how='outer', indicator=True))

   id      _merge
0   1   left_only
1   2        both
2   3  right_only

# `suffixes` only renames non-key columns that exist in BOTH frames. Frames that
# hold nothing but the join key never trigger it, so this example uses two
# frames sharing a 'val' column to make 'val_left' / 'val_right' appear.
left = pd.DataFrame({'id': [1, 2], 'val': [10, 20]})
right = pd.DataFrame({'id': [2, 3], 'val': [20, 30]})
merged = pd.merge(left, right, on='id', how='outer', suffixes=('_left', '_right'))
print(merged)

df = pd.DataFrame({'A': [1, 2, 3]})
df['A'] = pd.to_numeric(df['A'], downcast='integer')
print(df.dtypes)

A    int8
dtype: object

df['A'] = df['A'].astype('float32')
print(df.dtypes)

A    float32
dtype: object

df['col'] = ['a', 'b', 'a']
df['col'] = df['col'].astype('category')
print(df.dtypes)

A       float32
col    category
dtype: object

print(df.memory_usage(deep=True))

Index    132
A         12
col      211
dtype: int64

df = pd.DataFrame({'int': range(1000), 'flt': [1.0]*1000, 'cat': ['x']*1000})
# Shrink every column to a smaller dtype in one pass: unsigned integer,
# single-precision float, and categorical for the repeated string.
df = df.assign(
    int=pd.to_numeric(df['int'], downcast='unsigned'),
    flt=pd.to_numeric(df['flt'], downcast='float'),
    cat=df['cat'].astype('category'),
)
print(df.dtypes)

int      uint16
flt     float32
cat    category
dtype: object

df = pd.DataFrame({'val': [10, 20, 30, 1000]})
# Compute the 5th and 95th percentiles together, then unpack them.
q_low, q_high = df['val'].quantile([0.05, 0.95]).to_numpy()
# Keep only the rows strictly between the two cut-offs.
inside = df['val'].between(q_low, q_high, inclusive='neither')
print(df[inside])

   val
1   20
2   30

df['norm'] = (df['val'] - df['val'].mean()) / df['val'].std()
print(df)

    val      norm
0    10 -0.520336
1    20 -0.499931
2    30 -0.479525
3  1000  1.499792

df['scaled'] = (df['val'] - df['val'].min()) / (df['val'].max() - df['val'].min())
print(df)

    val      norm    scaled
0    10 -0.520336  0.000000
1    20 -0.499931  0.010101
2    30 -0.479525  0.020202
3  1000  1.499792  1.000000

df = pd.DataFrame({'id': [1, 1, 2]})
print(df.duplicated())

0    False
1     True
2    False
dtype: bool

print(df[df.duplicated()])

   id
1   1

df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
# axis=1 hands each row to the lambda, which adds that row's 'a' and 'b'.
row_totals = df.apply(lambda row: row['a'] + row['b'], axis=1)
print(row_totals)
df = pd.DataFrame({'first': ['Alice', 'Bob'], 'last': ['Smith', 'Jones']})
# Join first and last names element-wise with a single space between them.
df['full'] = df['first'].str.cat(df['last'], sep=' ')
print(df)

0    4
1    6
dtype: int64
   first   last         full
0  Alice  Smith  Alice Smith
1    Bob  Jones    Bob Jones

df = pd.DataFrame({'first': ['Alice', 'Bob'], 'last': ['Smith', 'Jones']})
df['full'] = df['first'] + ' ' + df['last']
print(df)

   first   last         full
0  Alice  Smith  Alice Smith
1    Bob  Jones    Bob Jones

df = pd.DataFrame({'score': [45, 67, 89, 91]})
# Bucket the scores into (0, 60], (60, 80] and (80, 100], labelled F, B and A.
df = df.assign(
    grade=pd.cut(df['score'], bins=[0, 60, 80, 100], labels=['F', 'B', 'A']),
)
print(df)

   score grade
0     45     F
1     67     B
2     89     A
3     91     A

df = pd.DataFrame({'x': [1, 2, 3], 'y': [1, 4, 9]})
print(df.corr())

          x         y
x  1.000000  0.989743
y  0.989743  1.000000

df = pd.DataFrame({'col': ['a', 'b', 'a', 'c']})
print(df['col'].value_counts())

col
a    2
b    1
c    1
Name: count, dtype: int64

# One row per column of df: its name, its dtype and its count of missing values.
null_counts = df.isnull().sum()
summary = {
    'cols': list(df.columns),
    'dtypes': list(df.dtypes),
    'nulls': list(null_counts),
}
print(pd.DataFrame(summary))

  cols  dtypes  nulls
0  col  object      0

Score: 200

Category: basics