User Tools

Site Tools

  • List unique values
# across the dataframe
# on a single column

* Duplicate values

dupes = df[df.duplicated(['col1', 'col2', 'col3'], keep=False)]
  • Convert a column to numeric
  • Select df rows that contain specific values, stored in a list
valuelist = ['value1', 'value2', 'value3']
df = df[df.column.isin(valuelist)]
  • Delete column from DataFrame
del df['column']
  • Select from DataFrame using criteria from multiple columns
newdf = df[(df['column_one']>2004) & (df['column_two']==9)]
  • Rename several DataFrame columns
# method 1
df = df.rename(columns = {
    'col1 old name':'col1 new name',
    'col2 old name':'col2 new name',
    'col3 old name':'col3 new name',
# method 2
old_names = ['name1',
new_names = ['NAME1',
df.rename(columns=dict(zip(old_names, new_names)), inplace=True)
  • Lower-case the column names - there are several ways to do it
# method 1
df.columns = map(str.lower, df.columns)
# method 2
  • Loop through rows in a DataFrame
# (only if you must) - row-by-row iteration slows processing down considerably
for index, row in df.iterrows():
    print index, row['some column']
  • Slice values in a DataFrame column (aka Series)
  • Sort (the df.sort() call shown below is the old API; current pandas requires sort_values() instead)
df = df.sort(['col1','col2','col3'],ascending=[1,1,0])
  • Change all NaNs to None
# useful before loading to a db
df = df.where((pd.notnull(df)), None)
  • Get a quick count of rows in a df.
#even quicker, I can use:

* Get rid of non-numeric values in a DataFrame

for col in refunds.columns.values:
  refunds[col] = refunds[col].replace('[^0-9]+.-', '', regex=True)

* Set DataFrame column values based on other column values

df.loc[(df['column1'] == some_value) & (df['column2'] == some_other_value), ['column_to_change']] = new_value
data_manipulation.txt · Last modified: 2016/10/07 14:30 by vincenzo