User Tools

Site Tools


data_manipulation
  • List unique values
# Unique values across the whole DataFrame: flatten every column into one
# 1-D array first, then de-duplicate.
pd.unique(df.values.ravel())

# Unique values of a single column (the result is already 1-D, so no
# ravel() is needed).
df['column_name'].unique()

  • Duplicate values

# keep=False flags every row of a duplicated group, not only the repeats.
dupe_mask = df.duplicated(subset=['col1', 'col2', 'col3'], keep=False)
dupes = df[dupe_mask]
  • Convert a column to numeric
# convert_objects() was removed from pandas; pd.to_numeric is the replacement.
# errors='coerce' turns values that cannot be parsed into NaN instead of raising.
pd.to_numeric(df['col'].astype(str), errors='coerce')
  • Select df rows that contain specific values, stored in a list
# Keep only the rows whose 'column' value appears in the list.
valuelist = [
    'value1',
    'value2',
    'value3',
]
df = df.loc[df['column'].isin(valuelist)]
  • Delete column from DataFrame
# Remove the column from the DataFrame in place.
df.drop(columns=['column'], inplace=True)
  • Select from DataFrame using criteria from multiple columns
# Combine the per-column conditions with & (element-wise AND).
after_2004 = df['column_one'].gt(2004)
is_nine = df['column_two'].eq(9)
newdf = df[after_2004 & is_nine]
  • Rename several DataFrame columns
# method 1: pass an explicit old -> new mapping and keep the returned copy
column_renames = {
    'col1 old name': 'col1 new name',
    'col2 old name': 'col2 new name',
    'col3 old name': 'col3 new name',
}
df = df.rename(columns=column_renames)

# method 2: pair up two parallel lists of names and rename in place
old_names = ['name1', 'name2', 'name3']
new_names = ['NAME1', 'NAME2', 'NAME3']

name_map = {old: new for old, new in zip(old_names, new_names)}
df.rename(columns=name_map, inplace=True)
  • Lower casing - a lot of tricks here
# method 1: lower-case the column labels.
# The Index .str accessor is used instead of map(), which only yields a
# lazy iterator on Python 3.
df.columns = df.columns.str.lower()

# method 2: lower-case the values of a string column.
# This returns a new Series; assign it back to the column to keep the result.
df['column'].str.lower()
  • Loop through rows in a DataFrame
# Only if you must: iterrows() builds a Series for every row, so it is far
# slower than a vectorized operation on the whole column.
for index, row in df.iterrows():
    print(index, row['some column'])
  • Slice values in a DataFrame column (aka Series)
# First two characters of every string in the column.
df['column'].str.slice(0, 2)
  • Sort (now sort_values() must be used instead)
# DataFrame.sort() was removed from pandas; sort_values() replaces it.
# One ascending flag per sort key: col1 and col2 ascending, col3 descending.
df = df.sort_values(['col1', 'col2', 'col3'], ascending=[True, True, False])
  • Change all NaNs to None
# Useful before loading to a db.
# Cast to object first: in a numeric column None would be coerced straight
# back to NaN, so the replacement would silently do nothing.
df = df.astype(object).where(pd.notnull(df), None)
  • Get a quick count of rows in a df.
# Number of rows.
len(df.index)

# Alternative: df.shape is a (rows, columns) tuple, so take its first element.
df.shape[0]

  • Get rid of non-numeric values in a DataFrame

# Strip every run of characters that is not a digit, a decimal point or a
# minus sign. The '.' and '-' must sit inside the character class; outside it
# the pattern only matches "non-digits, any character, then a literal '-'".
for col in refunds.columns:
    refunds[col] = refunds[col].replace(r'[^0-9.-]+', '', regex=True)

  • Set DataFrame column values based on other column values

# Build the row mask first, then assign into the target column for those rows.
rows_to_update = (df['column1'] == some_value) & (df['column2'] == some_other_value)
df.loc[rows_to_update, ['column_to_change']] = new_value
data_manipulation.txt · Last modified: 2016/10/07 14:30 by vincenzo