【Pandas】
Series DataFrame
Series
・Seriesをつくる
DataFrame
・DataFrameをつくる
・属性など
・型変換
・いろいろなメソッド
・インデックス・カラム名
・取得関連
・削除関連
・apply
・欠損関連
・データの連結
・データの整形
・Groupby
・処理速度
・プロット
・保存・読み出し
・表示
Series DataFrame
Series
Seriesをつくる
・Seriesをつくる
# Demo: several ways to build a pandas Series. Each statement is echoed as
# text before it is executed.
print("import pandas as pd")
import pandas as pd

SEP = '=================================================='

# Series from a plain range.
print("s = pd.Series(range(10))")
s = pd.Series(range(10))
print("s.values", s.values, sep="\n")
print(SEP)

# One-element list paired with a one-label index.
print('インデックス')
print("s = pd.Series(data=[range(10)], index=['s'])")
s = pd.Series(data=[range(10)], index=['s'])
print("print(s)", s, sep="\n")
print(SEP)

# Dict input: the key supplies the index label.
print("s = pd.Series({'s': range(10)})")
s = pd.Series({'s': range(10)})
print("print(s)", s, sep="\n")
print(SEP)

# A Series can also hold callables.
print('関数')
print("s = pd.Series([len, print, sum, lambda x: x**2])")
s = pd.Series([len, print, sum, lambda x: x**2])
print("print(s)", s, sep="\n")
Seriesに戻る
DataFrame
DataFrameをつくる 属性など 型変換 いろいろなメソッド インデックス・カラム名 取得関連 削除関連 apply 欠損関連 データの連結 データの整形 Groupby 処理速度 プロット 保存・読み出し 表示
・DataFrameをつくる
# Demo: constructing DataFrames (empty, from a dict of columns, and from a
# NumPy array). Each statement is echoed before it runs.
print("import numpy as np")
import numpy as np
print("import pandas as pd")
import pandas as pd

SEP = '===================================================='

# Empty frame.
print("df = pd.DataFrame()")
df = pd.DataFrame()
print("print(df)", df, sep="\n")
print(SEP)

# Dict of column name -> list of values.
print("df = pd.DataFrame(data={'a': [0,1,2], 'b':[3,4,5]})")
df = pd.DataFrame(data={'a': [0,1,2], 'b':[3,4,5]})
print("print(df)", df, sep="\n")
print(SEP)

# 3x3 array with explicit column names.
print("df = pd.DataFrame(np.arange(9).reshape(3,3), ")
print(" columns=['a','b','c'])")
grid = np.arange(9).reshape(3,3)
df = pd.DataFrame(grid, columns=['a','b','c'])
print("print(df)", df, sep="\n")
DataFrameに戻る
・属性など
# Demo: basic DataFrame attributes, then label-based (.loc) versus
# position-based (.iloc) row slicing.
print("import pandas as pd")
import pandas as pd

SEP = '=============================================================='

print("df = pd.DataFrame(data={'a': [0,1,2,3,4], 'b':[5,6,7,8,9]})")
df = pd.DataFrame(data={'a': [0,1,2,3,4], 'b':[5,6,7,8,9]})
print("print(df)", df, sep="\n")
print(SEP)

# Each attribute is echoed by name and then printed.
print("df['a'].dtype", df['a'].dtype, sep="\n")
print(SEP)
print("df.shape", df.shape, sep="\n")
print(SEP)
print("df.size", df.size, sep="\n")
print(SEP)
print("df.index", df.index, sep="\n")
print(SEP)
print("df.values", df.values, sep="\n")
print(SEP)

# .loc slices by label and includes the end label.
print('スライス')
print('行名、列名')
print("df.loc[:3]", df.loc[:3], sep="\n")
print(SEP)

# .iloc slices by position and excludes the end position.
print('インデックス')
print("df.iloc[:3]", df.iloc[:3], sep="\n")
DataFrameに戻る
・型変換
# Demo: converting the dtype of a whole DataFrame with astype(), then back
# to numbers with pd.to_numeric.
print('import numpy as np')
import numpy as np
print('import pandas as pd')
import pandas as pd

SEP = '==============================================='

print("df = pd.DataFrame(np.arange(9).reshape(3,3), ")
print(" columns=['a','b','c'])")
df = pd.DataFrame(np.arange(9).reshape(3,3), columns=['a','b','c'])
print("print(df)", df, sep="\n")
print("print(df.dtypes)", df.dtypes, sep="\n")

# Each step: heading, echoed statement, the conversion itself, resulting dtypes.
for heading, echoed, target in (
    ('str型に変換', "df = df.astype(str)", str),
    ('float型に変換', "df = df.astype(float)", float),
    ('int型に変換', "df = df.astype(int)", int),
    ('category型に変換', "df = df.astype('category')", 'category'),
):
    print(SEP)
    print(heading)
    print(echoed)
    df = df.astype(target)
    print("print(df.dtypes)", df.dtypes, sep="\n")

# Column-wise numeric conversion via apply.
print(SEP)
print('apply, to_numeric')
print("df = df.apply(pd.to_numeric)")
df = df.apply(pd.to_numeric)
print("print(df.dtypes)", df.dtypes, sep="\n")
DataFrameに戻る
・いろいろなメソッド
# Demo: descriptive statistics and other common DataFrame methods, run on
# the iris feature table. Each statement is echoed before it is executed.
print('import numpy as np')
import numpy as np
print('import pandas as pd')
import pandas as pd
print('from sklearn.datasets import load_iris')
from sklearn.datasets import load_iris

SEP = '======================================================================='

print()
print('【データ作成】')
print('data = load_iris()')
data = load_iris()
# Keep only the text before ' (' in each feature name.
print("columns = [name.split(' (')[0] for name in data.feature_names]")
columns = [name.split(' (')[0] for name in data.feature_names]
print("df = pd.DataFrame(data.data, columns=columns)")
df = pd.DataFrame(data.data, columns=columns)
print(SEP)
# info() prints its report itself, so it is not wrapped in print().
print('【info(欠損・データ型など)】')
print("df.info()\n")
df.info()
print(SEP)
print('【describe(基本統計量)】')
print("print(df.describe())\n")
print(df.describe())
print(SEP)
print('【count】')
print("print(df.count())\n")
print(df.count())
print(SEP)
print('【sum】')
print("print(df.sum())\n")
print(df.sum())
print(SEP)
print('【mean】')
print("print(df.mean())\n")
print(df.mean())
print(SEP)
print('【median】')
print("print(df.median())\n")
print(df.median())
print(SEP)
print('【mode(最頻値)】')
print("print(df.mode())\n")
print(df.mode())
print(SEP)
print('【min関連】')
print("print(df.idxmin())\n")
print(df.idxmin())
print(SEP)
print("print(df.min())\n")
print(df.min())
print(SEP)
print('【max関連】')
print("print(df.idxmax())\n")
print(df.idxmax())
print(SEP)
print("print(df.max())\n")
print(df.max())
print(SEP)
print('【std】')
print("print(df.std())\n")
print(df.std())
print(SEP)
print('【var】')
print("print(df.var())\n")
print(df.var())
print(SEP)
# NumPy ufuncs applied to the whole frame.
print('【対数・平方根など(Numpy)】')
print('対数')
print("print(np.log(df).head())\n")
print(np.log(df).head())
print(SEP)
print('対数を戻す')
print("print(np.exp(df).head())\n")
print(np.exp(df).head())
print(SEP)
print('平方根')
print("print(np.sqrt(df).head())\n")
print(np.sqrt(df).head())
print(SEP)
print('【quantile(分位数)】')
print("print(df.quantile([0.25, 0.5, 0.75]))\n")
print(df.quantile([0.25, 0.5, 0.75]))
print(SEP)
print('【cov(共分散)】')
print("print(df.cov())\n")
print(df.cov())
print(SEP)
print('【corr(相関係数-1〜1,-1:負相関,0:相関なし,1:正相関)】')
print("print(df.corr())\n")
print(df.corr())
print(SEP)
print('【unique関連】')
print("print(df['sepal length'].nunique())\n")
print(df['sepal length'].nunique())
print(SEP)
print("print(df['sepal length'].unique()[:10])\n")
print(df['sepal length'].unique()[:10])
print(SEP)
print("print(df['sepal length'].value_counts().head())\n")
print(df['sepal length'].value_counts().head())
print(SEP)
print('【sort_values】')
# The echoed column name now matches the one actually used ('sepal length').
print("print(df.sort_values('sepal length', ascending=True).head())\n")
print(df.sort_values('sepal length', ascending=True).head())
print(SEP)
print('【values(値をnp.arrayで取得)】')
print("print(df.values[:5])\n")
print(df.values[:5])
print(SEP)
print('【diff(差分をとる)】')
print("print(df.diff(1).head())\n")
print(df.diff(1).head())
print(SEP)
# Parenthesised so head() applies to the difference, not only to the shifted
# frame; otherwise every row is printed.
print("print((df - df.shift(1)).head())\n")
print((df - df.shift(1)).head())
print(SEP)
print('【pct_change(変化率をとる)】')
print("print(df.pct_change(1).head())\n")
print(df.pct_change(1).head())
DataFrameに戻る
・インデックス・カラム名
# Demo: setting and renaming index labels and column names.
# numpy is imported here because the snippet calls np.arange and must run
# on its own.
print("import numpy as np")
import numpy as np
print("import pandas as pd")
import pandas as pd

SEP = '============================================================='

# Labels passed straight to the constructor.
print('引数で指定')
print("df = pd.DataFrame(np.arange(9).reshape(3,3), ")
print(" index=['a','b','c'], ")
print(" columns=['col1','col2','col3'])")
df = pd.DataFrame(np.arange(9).reshape(3,3),
                  index=['a','b','c'],
                  columns=['col1','col2','col3'])
print("print(df)")
print(df)
print(SEP)
# Rename columns by assigning the whole list, or per name with rename().
print("カラム名を変更する")
print("df.columns = ['c1', 'c2', 'c3']")
df.columns = ['c1', 'c2', 'c3']
print("print(df)")
print(df)
print(SEP)
print("df = df.rename(columns={'c1':'c_1', 'c2':'c_2', 'c3':'c_3'})")
df = df.rename(columns={'c1':'c_1', 'c2':'c_2', 'c3':'c_3'})
print("print(df)")
print(df)
print(SEP)
# Reorder columns by position.
print('順序変える')
print("df = df.iloc[:, [2, 1, 0]]")
df = df.iloc[:, [2, 1, 0]]
print("print(df)")
print(df)
print(SEP)
# Replace the index labels and name the index.
print('インデックス名を変更する')
print("df.index = ['x', 'y', 'z']")
df.index = ['x', 'y', 'z']
print("df.index.name = 'index'")
df.index.name = 'index'
print("print(df)")
print(df)
print(SEP)
print("df = df.rename(index={'x':'001', 'y':'002', 'z':'003'})")
df = df.rename(index={'x':'001', 'y':'002', 'z':'003'})
print("print(df)")
print(df)
print(SEP)
# drop=True discards the old index instead of keeping it as a column.
print('インデックスをリセットする')
print("df.reset_index(drop=True, inplace=True)")
df.reset_index(drop=True, inplace=True)
print("print(df)")
print(df)
print(SEP)
# Promote column 'c_1' to be the index.
print('インデックスカラムを変える')
print("df.set_index('c_1', drop=True, inplace=True)")
df.set_index('c_1', drop=True, inplace=True)
print("print(df)")
print(df)
DataFrameに戻る
・取得関連
基本的な取得
# Demo: selecting rows, columns and cells by label, by position, by boolean
# condition, and with DataFrame.query().
import numpy as np
import pandas as pd

SEP = '=================================================='

df = pd.DataFrame(np.arange(9).reshape(3,3),
                  index=['a','b','c'],
                  columns=['col1','col2','col3'])
print("print(df)", df, sep="\n")
print(SEP)

# --- by label ---
print('ラベルで取得')
print("df[['col1', 'col2']]", df[['col1', 'col2']], sep="\n")
print(SEP)
print("df.loc['b':, 'col2':]", df.loc['b':, 'col2':], sep="\n")
print(SEP)
print("df.loc['b', 'col2']", df.loc['b', 'col2'], sep="\n")
print(SEP)
print("df.at['a', 'col1']", df.at['a', 'col1'], sep="\n")
print(SEP)

# --- by integer position ---
print('インデックスで取得')
print("df.iloc[1:, 1:]", df.iloc[1:, 1:], sep="\n")
print(SEP)
print("df.iloc[1, 1]", df.iloc[1, 1], sep="\n")
print(SEP)
print("df.iat[0, 0]", df.iat[0, 0], sep="\n")
print(SEP)

# --- index label of the first row matching a value ---
print('特定要素のインデックスを取得')
print("df[df['col1']==3].index[0]", df[df['col1']==3].index[0], sep="\n")
print(SEP)

# --- boolean conditions ---
print('条件指定で取得')
print("df[df.col3 > 5]", df[df.col3 > 5], sep="\n")
print(SEP)
print("df.loc[[True, False, True], [True, False, True]]",
      df.loc[[True, False, True], [True, False, True]], sep="\n")
print(SEP)
print("df.loc[[True, False, True], ['col1', 'col2']]",
      df.loc[[True, False, True], ['col1', 'col2']], sep="\n")
print(SEP)
print("df[(df.col1>0) & (df.col3<8)]", df[(df.col1>0) & (df.col3<8)], sep="\n")
print(SEP)
print("df[(df.col1==0) | (df.col3==8)]", df[(df.col1==0) | (df.col3==8)], sep="\n")
print(SEP)
print("df[df.col1.isin([0,3])]", df[df.col1.isin([0,3])], sep="\n")
print(SEP)
print("df[df.index.isin(['a'])]", df[df.index.isin(['a'])], sep="\n")
print(SEP)
print("df[df.isin([0,1,2,3,4])]", df[df.isin([0,1,2,3,4])], sep="\n")
print(SEP)

# --- query(): '@name' refers to a Python variable ---
print('query')
print("df.query('col1 < 4')", df.query('col1 < 4'), sep="\n")
print(SEP)
print("x = 4")
x = 4
print("df.query('col1 < @x')", df.query('col1 < @x'), sep="\n")
print(SEP)
print("l = [3, 6]")
l = [3, 6]
print("df.query('col1 in @l')", df.query('col1 in @l'), sep="\n")
DataFrameに戻る
文字列で取得
# Demo: string predicates and splitting on string columns via the .str
# accessor.
print('import pandas as pd')
import pandas as pd

SEP = '============================='

df = pd.DataFrame({'a': ['a_1', 'b_a', '11'],
                   'b': ['b_1', 'c_b', '22'],
                   'c': ['c_1', 'a_c', '33']})
print("print(df)")
print(df)
print(SEP)
# Substring test.
print('【文字列条件(文字列カラム)】')
print('文字列含む')
print("df['a'].str.contains('a')")
print(df['a'].str.contains('a'))
print(SEP)
print('正規表現')
print('数字を含む')
# Raw strings keep the regex backslash literal instead of relying on the
# invalid "\d" escape sequence.
# NOTE(review): str.match anchors at the start of each value, so this tests
# "starts with a digit"; str.contains(r'\d') would find a digit anywhere.
print(r"df['b'].str.match(r'\d')")
print(df['b'].str.match(r'\d'))
print(SEP)
# Prefix / suffix tests.
print('先頭・末尾文字')
print("df['a'].str.startswith('a')")
print(df['a'].str.startswith('a'))
print(SEP)
print("df['a'].str.endswith('a')")
print(df['a'].str.endswith('a'))
print(SEP)
# Split each value on '_' into a list.
print('分割')
print("df['c'].str.split('_')")
print(df['c'].str.split('_'))
DataFrameに戻る
データ型で取得
# Demo: picking columns by data type with DataFrame.select_dtypes().
print('import pandas as pd')
import pandas as pd

SEP = '================================'

# One column per kind of data: text, integer, datetime, boolean.
df = pd.DataFrame({
    'str': ['a', 'b'],
    'int': [1, 2],
    'date': pd.date_range(start='2022-04-16', end='2022-04-17'),
    'bool': [True, False],
})
print("print(df)", df, sep="\n")
print(SEP)
print("df.select_dtypes(['object'])", df.select_dtypes(['object']), sep="\n")
print(SEP)
print("df.select_dtypes(['number'])", df.select_dtypes(['number']), sep="\n")
print(SEP)
print("df.select_dtypes(['datetime'])", df.select_dtypes(['datetime']), sep="\n")
print(SEP)
print("df.select_dtypes(['bool'])", df.select_dtypes(['bool']), sep="\n")
DataFrameに戻る
for文で取得
# Demo: iterating over DataFrame rows with iterrows().
import numpy as np
import pandas as pd

df = pd.DataFrame(np.arange(9).reshape(3,3))
print("print(df)")
print(df)
print('==============================')
# The echoed loop and the real loop both carry the body indentation that the
# snippet needs to be valid Python.
print("for i, row in df.iterrows():")
print("    print(i, row.values)")
# iterrows() yields (index label, row as a Series) pairs.
for i, row in df.iterrows():
    print(i, row.values)
DataFrameに戻る
・削除関連
削除関連
# Demo: removing rows and columns from a DataFrame.
import numpy as np
import pandas as pd

SEP = '======================'

df = pd.DataFrame(np.arange(9).reshape(3,3),
                  index=['01','02','03'],
                  columns=['a','b','c'])
print("print(df)", df, sep="\n")
print(SEP)

# Rows: drop by index label, or keep rows whose index differs from a label.
print('行を削除')
print('インデックスの条件')
print("df.drop(['01'])", df.drop(['01']), sep="\n")
print(SEP)
print("df[df.index != '01']", df[df.index != '01'], sep="\n")
print(SEP)

# Rows: filter on a column value.
print('カラム値の条件')
print("df[df['a'] != 0]", df[df['a'] != 0], sep="\n")
print(SEP)

# Columns: drop() returns a new frame, del modifies df in place.
print('カラムを削除')
print("df.drop('a', axis=1)", df.drop('a', axis=1), sep="\n")
print(SEP)
print("del df['a']")
del df['a']
print("print(df)", df, sep="\n")
DataFrameに戻る
・apply
apply
# Demo: Series.apply, row-wise DataFrame.apply, and element-wise mapping.
import pandas as pd

SEP = '==========================================================='

df = pd.DataFrame({'a': [1,2,3], 'b': ['x','y','z']})
print("print(df)")
print(df)
print(SEP)
# Column-wise apply on a single Series.
print("print(df['a'].apply(lambda x: str(x*2)+' point'))")
print(df['a'].apply(lambda x: str(x*2)+' point'))
print(SEP)
print("print(df['b'].apply(lambda x: 'type_' + x))")
print(df['b'].apply(lambda x: 'type_' + x))
print(SEP)
# Row-wise apply: with axis=1 each row is passed to the function as a Series.
print('行を引数として渡す')
print("def f(x):")
print("    return x.iloc[0]*x.iloc[1]")


def f(x):
    """Multiply the first two values of a row, selected by position.

    ``.iloc`` is used because the row is labelled 'a'/'b'; integer keys
    through plain ``[]`` on a label-indexed Series are deprecated.
    """
    return x.iloc[0]*x.iloc[1]


print("print(df.apply(lambda x: f(x), axis=1))")
print(df.apply(lambda x: f(x), axis=1))
print(SEP)
print("print(df.apply(lambda x: str(x.iloc[0]) + '_' + x.iloc[1], axis=1))")
print(df.apply(lambda x: str(x.iloc[0]) + '_' + x.iloc[1], axis=1))
print(SEP)
print("データフレームに関数を直接適用")
# DataFrame.applymap was renamed to DataFrame.map in pandas 2.1; use
# whichever this installation provides.
elementwise = df.map if hasattr(pd.DataFrame, 'map') else df.applymap
print("df.map(lambda x: len(str(x)))  # df.applymap on pandas < 2.1")
print(elementwise(lambda x: len(str(x))))
DataFrameに戻る
・欠損関連
欠損関連
# Demo: detecting, filling and dropping missing values.
print('import numpy as np')
import numpy as np
print('import pandas as pd')
import pandas as pd

SEP = '======================================================'

print("df = pd.DataFrame({'a': [pd.NA, 1, 2, pd.NA], ")
print(" 'b': [np.nan, 5, 6, np.nan]})")
df = pd.DataFrame({'a': [pd.NA, 1, 2, pd.NA],
                   'b': [np.nan, 5, 6, np.nan]})
print('print(df)')
print(df)
print(SEP)
# Boolean mask of missing cells.
print("欠損値を確認")
print("df.isnull()")
print(df.isnull())
print(SEP)
# Missing count per column.
print('カラムの欠損値の数を確認')
print("df.isnull().sum().sort_values(ascending=False)")
print(df.isnull().sum().sort_values(ascending=False))
print(SEP)
# Missing count per row.
print('インデックスの欠損値の数を確認')
print("df.isnull().sum(axis=1).sort_values(ascending=False)")
print(df.isnull().sum(axis=1).sort_values(ascending=False))
print(SEP)
print('欠損値以外を確認')
print("df.notnull()")
print(df.notnull())
print(SEP)
# Filling with a constant, with column means, or per column via a dict.
print('欠損値を埋める')
print("df.fillna(0)")
print(df.fillna(0))
print(SEP)
print("df.fillna(df.mean())")
print(df.fillna(df.mean()))
print(SEP)
print("df.fillna(0).iloc[0:, 1:]")
print(df.fillna(0).iloc[0:, 1:])
print(SEP)
print("df.fillna({'a': 0})")
print(df.fillna({'a': 0}))
print(SEP)
# Forward / backward fill. ffill()/bfill() replace the deprecated
# fillna(method=...) spelling.
print('前後方置換')
print("df.ffill()")
print(df.ffill())
print("df.bfill()")
print(df.bfill())
print(SEP)
# Dropping rows or columns that contain missing values.
print('欠損値の削除')
print("df.dropna() (インデックス)")
print(df.dropna())
print(SEP)
print("df.dropna(axis=1) (カラム)")
print(df.dropna(axis=1))
print(SEP)
print('カラム指定')
print("df.dropna(subset=['a'])")
print(df.dropna(subset=['a']))
print(SEP)
# thresh=1 keeps rows that have at least one non-missing value.
print('欠損以外がthresh個以上')
print("df.dropna(thresh=1)")
print(df.dropna(thresh=1))
DataFrameに戻る
・データの連結
データの連結
# Demo: concatenating frames, appending a row, and SQL-style merges.
import numpy as np
import pandas as pd

SEP = '===================================================================='

df = pd.DataFrame(np.arange(9).reshape(3,3))
print("print(df)", df, sep="\n")
print(SEP)

# Stack the frame on itself along rows, then along columns.
print("行方向")
print("pd.concat([df, df], axis=0)", pd.concat([df, df], axis=0), sep="\n")
print(SEP)
print('列方向')
print("pd.concat([df, df], axis=1)", pd.concat([df, df], axis=1), sep="\n")
print(SEP)

# Assigning to a new label through .loc appends a row.
print('行を追加')
print("df.loc[3] = [9,10,11]")
df.loc[3] = [9,10,11]
print("print(df)", df, sep="\n")
print(SEP)

# Two frames sharing the key column 'b'.
print('merge')
df1 = pd.DataFrame({'a': [1,2,3], 'b': [4,5,6]})
df2 = pd.DataFrame({'b': [4,5], 'c': [7,8]})
print("print(df1)", df1, sep="\n")
print("print(df2)", df2, sep="\n")
print(SEP)

# The four join types on the shared key.
print('on:結合基準カラム, how:結合方法')
print("pd.merge(df1, df2, on='b', how='inner')",
      pd.merge(df1, df2, on='b', how='inner'), sep="\n")
print(SEP)
print("print(pd.merge(df1, df2, on='b', how='outer'))",
      pd.merge(df1, df2, on='b', how='outer'), sep="\n")
print(SEP)
print("print(pd.merge(df1, df2, on='b', how='left'))",
      pd.merge(df1, df2, on='b', how='left'), sep="\n")
print(SEP)
print("print(pd.merge(df1, df2, on='b', how='right'))",
      pd.merge(df1, df2, on='b', how='right'), sep="\n")
print(SEP)

# Key column named separately for the left and right frame.
print("pd.merge(df1, df2, left_on='b', right_on='b', how='outer')")
print(pd.merge(df1, df2, left_on='b', right_on='b', how='outer'), '\n')
print(SEP)

# Non-key columns with the same name get the given suffixes.
print('suffixes(同名カラムを区別)')
df1 = pd.DataFrame({'a': [1,2,3], 'b': [4,5,6], 'c':[7,8,9]})
df2 = pd.DataFrame({'a': [1,2], 'b': [3,4], 'c': [5,6]})
print("print(df1)", df1, sep="\n")
print("print(df2)", df2, sep="\n")
print("pd.merge(df1, df2, on='b', suffixes=('_df1', '_df2'), how='outer')",
      pd.merge(df1, df2, on='b', suffixes=('_df1', '_df2'), how='outer'), sep="\n")
DataFrameに戻る
・データの整形
いろいろな整形 重複行を削除 ダミー(カテゴリ)変数 2値分類・ビニング、等分 文字列置換
いろいろな整形
# Demo: reshaping a DataFrame (melt, unstack/stack, transpose, pivot).
import numpy as np
import pandas as pd

SEP = '==========================================================='

df = pd.DataFrame(np.arange(9).reshape(3,3), columns=['a','b','c'])
print("print(df)", df, sep="\n")
print(SEP)

# Wide -> long: keep 'a', fold the other columns into name/value pairs.
print('引数:残すカラム名, 元のカラム名, 値のカラム名')
print("df.melt(id_vars='a', var_name='melt' ,value_name='value')",
      df.melt(id_vars='a', var_name='melt' ,value_name='value'), sep="\n")
print(SEP)

print('unstack(行を列へ)', df.unstack(), sep="\n")
print(SEP)
print('stack(列を行へ)', df.stack(), sep="\n")
print(SEP)

# Two spellings of the transpose.
print('転置')
print("df.T", df.T, sep="\n")
print(SEP)
print("df.transpose()", df.transpose(), sep="\n")
print(SEP)

# Long -> wide: 'a' values become rows, 'b' values become columns.
print('pivot')
print("df.pivot(index='a', columns='b', values='c')",
      df.pivot(index='a', columns='b', values='c'), sep="\n")
DataFrameに戻る
重複行を削除
# Demo: finding and removing duplicated rows.
import pandas as pd

SEP = '======================'

rows = [[1,2,3],[1,2,3],[4,5,6],[7,8,9],[7,8,9]]
df = pd.DataFrame(rows)
print('print(df)', df, sep="\n")
print(SEP)

# duplicated() flags every repeat of an earlier row.
print('重複行')
print("df[df.duplicated()]", df[df.duplicated()], sep="\n")
print(SEP)

# Two ways to keep only the first occurrence of each row.
print('重複行削除')
print("df[~df.duplicated()]", df[~df.duplicated()], sep="\n")
print(SEP)
print("df.drop_duplicates()", df.drop_duplicates(), sep="\n")
DataFrameに戻る
ダミー(カテゴリ)変数
# Demo: one-hot (dummy) encoding with pd.get_dummies().
import pandas as pd

SEP = '======================================'

df = pd.DataFrame({'abc': ['a','b','c','b','a'],
                   'num': [1,2,3,4,5],
                   'xyz': ['x','x','y','z','y']})
print("print(df)")
print(df)
print(SEP)
print("pd.get_dummies(df['abc'])")
print(pd.get_dummies(df['abc']))
print(SEP)
# dtype selects the type of the indicator values.
print("pd.get_dummies(df['xyz'], dtype=int)")
print(pd.get_dummies(df['xyz'], dtype=int))
print(SEP)
print("pd.get_dummies(df['num'], dtype=str)")
# Printed explicitly like the other results; as a bare expression the value
# is discarded when the snippet runs as a script.
print(pd.get_dummies(df['num'], dtype=str))
DataFrameに戻る
2値分類・ビニング、等分
# Demo: binning values with pd.cut(), by bin count or by explicit edges.
print('import pandas as pd')
import pandas as pd

SEP = '============================================================='

# Two equal-width bins; labels=False returns integer bin codes.
print('分割数指定')
# The echo now shows the call that is actually executed (pd.cut, balanced
# parentheses).
print("pd.cut(range(10), 2, labels=False)")
print(pd.cut(range(10), 2, labels=False))
print(SEP)
# Explicit edges; right=False makes each bin closed on the left, open on the right.
print('閾値指定')
print("right:True(], False[), (:<, ]:<=")
print("pd.cut(range(10), [0, 5, 10], right=False, labels=False)")
print(pd.cut(range(10), [0, 5, 10], right=False, labels=False))
print(SEP)
print("pd.cut(range(10), [0, 3, 6, 10], right=False, labels=False)")
print(pd.cut(range(10), [0, 3, 6, 10], right=False, labels=False))
DataFrameに戻る
文字列置換
# Demo: replacing values and substrings in string columns.
import pandas as pd

SEP = '========================================================'

df = pd.DataFrame({
    's': ['abc', 'def', 'ghi'],
    'd': ['123', '456', '789'],
})
print('print(df)', df, sep="\n")
print(SEP)

# Whole-value replacement: parallel lists, then a mapping.
print("df['d'].replace(['123', '456'], ['abc', 'def'])",
      df['d'].replace(['123', '456'], ['abc', 'def']), sep="\n")
print(SEP)
print("df['d'].replace({'123': 'abc'})", df['d'].replace({'123': 'abc'}), sep="\n")
print(SEP)

# Pattern-based replacement.
print('正規表現')
print("df['s'].replace(r'[a-zA-Z]+', 'alphabet', regex=True)",
      df['s'].replace(r'[a-zA-Z]+', 'alphabet', regex=True), sep="\n")
print(SEP)

# Substring replacement through the .str accessor.
print('一部文字')
print("df['s'].str.replace('a', 'A')", df['s'].str.replace('a', 'A'), sep="\n")
DataFrameに戻る
・Groupby
Groupby
# Demo: grouping rows with groupby() and aggregating per group.
import pandas as pd

SEP = '=================================================================='

df = pd.DataFrame({'a': [1,2,3,4,5],
                   'b': ['x','y','x','y','x'],
                   'c': [100,110,120,130,140],
                   'd': [0,1,1,1,0],
                   'e': [35,60,50,75,45]})
print("print(df)")
print(df)
print(SEP)
# Group key -> row labels belonging to that group.
print("df.groupby('b').groups")
print(df.groupby('b').groups)
print(SEP)
print("df.groupby('b').get_group('x')")
print(df.groupby('b').get_group('x'))
print(SEP)
# Iterating a groupby yields (key, sub-frame) pairs. The echoed code and the
# real loop both carry the body indentation needed for valid Python.
print("for val, data in df.groupby('b'):")
print("    print(val)")
print("    print(data)")
for val, data in df.groupby('b'):
    print(val)
    print(data)
print(SEP)
print("df.groupby('b').mean()")
print(df.groupby('b').mean())
print(SEP)
# Several aggregations over the whole (ungrouped) frame.
print('aggregate')
print("df.aggregate(['sum', 'min', 'max'])")
print(df.aggregate(['sum', 'min', 'max']))
print(SEP)
print("df.groupby('b')['d'].value_counts()")
print(df.groupby('b')['d'].value_counts())
print(SEP)
# A different aggregation per column.
print("df.groupby('b').agg({'c': 'mean', 'd': 'sum', 'e': 'max'})")
print(df.groupby('b').agg({'c': 'mean', 'd': 'sum', 'e': 'max'}))
print(SEP)
print('自作関数')
print("def func(x):")
print("    return sum(x)")


def func(x):
    """Return the sum of the values in ``x`` (used as a custom aggregator)."""
    return sum(x)


print("df.groupby('b').agg([func, lambda x: sum(x)])")
print(df.groupby('b').agg([func, lambda x: sum(x)]))
print(SEP)
# filter() keeps the rows of every group for which the predicate is true.
print("df.groupby('b').filter(lambda x: x['e'].mean() > 50)")
print(df.groupby('b').filter(lambda x: x['e'].mean() > 50))
DataFrameに戻る
・処理速度
カラムの和
# Timed in JupyterLab: the `%time` lines are IPython magics, so this cell
# only runs inside IPython/Jupyter, not as a plain Python script.
# It compares several ways of adding two columns row by row.
import numpy as np
import pandas as pd
# One million rows of random values in two columns.
data = np.random.random((1000000, 2))
df = pd.DataFrame(data, columns=['col_1','col_2'])
# The printed label means "roughly in order of speed"; actual timings
# depend on the machine and library versions.
print('おおむね速い順')
# Column addition using bracket access.
print("res = df['col_1'] + df['col_2']")
%time res = df['col_1'] + df['col_2']
# Column addition using attribute access.
print("res = df.col_1 + df.col_2")
%time res = df.col_1 + df.col_2
# Expression string evaluated by DataFrame.eval.
print("res = df.eval('col_1 + col_2')")
%time res = df.eval('col_1 + col_2')
# Row sums on the underlying NumPy array.
print("res = df.values.sum(axis=1)")
%time res = df.values.sum(axis=1)
print("res = np.sum(df.values, axis=1)")
%time res = np.sum(df.values, axis=1)
# Row sums on the DataFrame itself.
print("res = np.sum(df, axis=1)")
%time res = np.sum(df, axis=1)
print("res = df.sum(axis=1)")
%time res = df.sum(axis=1)
# Python-level function applied to every row.
print("res = df.apply(lambda row: row['col_1'] + row['col_2'], axis=1)")
%time res = df.apply(lambda row: row['col_1'] + row['col_2'], axis=1)
DataFrameに戻る
・プロット
いろいろなプロット
# Demo: basic plot types through the DataFrame.plot accessor.
# The trailing semicolons are kept from the notebook original.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

plt.style.use('dark_background')

# Line plot of a sine wave
wave = np.sin(np.arange(100)/10)
df = pd.DataFrame(wave, columns=['sin'])
df.plot.line()

# Scatter plot of two random columns
points = np.random.randn(100,2)
df = pd.DataFrame(points)
df.plot.scatter(x=0, y=1);

# Histogram of random samples
samples = np.random.randn(100)
df = pd.DataFrame(samples, columns=['randn'])
df.plot.hist(bins=20, ec='b');

# Shared x axis; 'x**2' is drawn against a secondary y axis
df = pd.DataFrame({
    'x': np.arange(100),
    'x**2': np.arange(100)**2,
})
df[['x', 'x**2']].plot(secondary_y='x**2');
DataFrameに戻る
サブプロット(複数)
# Demo: two side-by-side subplots in one figure.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

plt.style.use('dark_background')

angles = np.arange(100)/10
df = pd.DataFrame({'sin': np.sin(angles),
                   'cos': np.cos(angles)})

fig = plt.figure(figsize=(10,4))

# Left panel: each series is drawn on the axes created just before it.
ax1 = fig.add_subplot(1,2,1, title='sin')
df['sin'].plot(ax=ax1)

# Right panel.
ax2 = fig.add_subplot(1,2,2, title='cos')
df['cos'].plot(ax=ax2);
DataFrameに戻る
・保存・読み出し
各種ファイルの保存・読み出し
# !pip install openpyxl
# Demo: writing a frame to disk and reading it back in four formats.
# Each step reads back the frame produced by the previous step.
# (Excel support needs openpyxl, see the pip line in the original cell.)
import numpy as np
import pandas as pd

df = pd.DataFrame(np.arange(10).reshape(5,2))

print('pickle')
pickle_path = 'df.pkl'
df.to_pickle(pickle_path)
df = pd.read_pickle(pickle_path)
print(df, '\n')

print('csv')
csv_path = 'df.csv'
df.to_csv(csv_path, index=False)
df = pd.read_csv(csv_path)
print(df, '\n')

print('excel')
excel_path = 'df.xlsx'
df.to_excel(excel_path, sheet_name='Sheet1', index=False)
df = pd.read_excel(excel_path, sheet_name='Sheet1')
print(df, '\n')

print('json')
json_path = 'df.json'
df.to_json(json_path)
df = pd.read_json(json_path)
print(df)
DataFrameに戻る
・表示
外部リンク:https://pandas.pydata.org/docs/user_guide/options.html
表示設定
表示設定
# Demo: pandas display options (rows, columns, precision, column width).
# These settings are global to the pandas session.
import numpy as np
import pandas as pd

SEP = '========================================================================='

df = pd.DataFrame(np.random.randn(7,7))
print("print(df)", df, sep="\n")
print(SEP)

# Truncate output to 5 rows / 5 columns with 6 decimal places.
print("pd.options.display.max_rows = 5")
pd.options.display.max_rows = 5
print("pd.set_option('display.max_columns', 5)")
pd.set_option('display.max_columns', 5)
print("pd.options.display.precision = 6")
pd.options.display.precision = 6
print("print(df)", df, sep="\n")
print(SEP)

# None removes the row limit.
print('表示する行数を設定(default:60)')
print('全行表示')
print("pd.options.display.max_rows = None")
pd.options.display.max_rows = None
print(df)
print(SEP)

# None removes the column limit.
print('表示するカラム数を設定(default:0)')
print('全カラム表示')
print("pd.set_option('display.max_columns', None)")
pd.set_option('display.max_columns', None)
print(df, '\n')
print(SEP)

# Number of decimal places shown.
print('小数点以下表示(default:6)')
print("pd.options.display.precision = 3")
pd.options.display.precision = 3
print(df, '\n')
print(SEP)

# Maximum width of a cell's text before it is truncated.
print('カラム内表示を設定(default:50)')
df = pd.DataFrame({'text': ['abcdefghijklmnopqrstuvwxyz']})
print("print(df)", df, sep="\n")
print("pd.set_option('display.max_colwidth', 10)")
pd.set_option('display.max_colwidth', 10)
print("print(df)", df, sep="\n")
print(SEP)

# None shows the full cell text.
print('全て表示')
print("pd.set_option('display.max_colwidth', None)")
pd.set_option('display.max_colwidth', None)
print("print(df)", df, sep="\n")
DataFrameに戻る