python - Comment puis-je sélectionner des lignes à partir d'une DataFrame basée sur des valeurs de colonne

Mots clés : pythonpandasdataframepython

meilleur 5 Réponses python - Comment puis-je sélectionner des lignes à partir d'une DataFrame basée sur des valeurs de colonne

vote vote

96

df.loc[df['column_name'] == some_value] 
df.loc[df['column_name'].isin(some_values)] 
df.loc[(df['column_name'] >= A) & (df['column_name'] <= B)] 
df['column_name'] >= A & df['column_name'] <= B 
df['column_name'] >= (A & df['column_name']) <= B 
df.loc[df['column_name'] != some_value] 
df.loc[~df['column_name'].isin(some_values)] 
import pandas as pd import numpy as np df = pd.DataFrame({'A': 'foo bar foo bar foo bar foo foo'.split(),                    'B': 'one one two three two two one three'.split(),                    'C': np.arange(8), 'D': np.arange(8) * 2}) print(df) #      A      B  C   D # 0  foo    one  0   0 # 1  bar    one  1   2 # 2  foo    two  2   4 # 3  bar  three  3   6 # 4  foo    two  4   8 # 5  bar    two  5  10 # 6  foo    one  6  12 # 7  foo  three  7  14  print(df.loc[df['A'] == 'foo']) 
     A      B  C   D 0  foo    one  0   0 2  foo    two  2   4 4  foo    two  4   8 6  foo    one  6  12 7  foo  three  7  14 
print(df.loc[df['B'].isin(['one','three'])]) 
     A      B  C   D 0  foo    one  0   0 1  bar    one  1   2 3  bar  three  3   6 6  foo    one  6  12 7  foo  three  7  14 
df = df.set_index(['B']) print(df.loc['one']) 
       A  C   D B               one  foo  0   0 one  bar  1   2 one  foo  6  12 
df.loc[df.index.isin(['one','two'])] 
       A  C   D B               one  foo  0   0 one  bar  1   2 two  foo  2   4 two  foo  4   8 two  bar  5  10 one  foo  6  12 
vote vote

82

import pandas as pd, numpy as np  df = pd.DataFrame({'A': 'foo bar foo bar foo bar foo foo'.split(),                    'B': 'one one two three two two one three'.split(),                    'C': np.arange(8), 'D': np.arange(8) * 2}) 
mask = df['A'] == 'foo' 
df[mask]       A      B  C   D 0  foo    one  0   0 2  foo    two  2   4 4  foo    two  4   8 6  foo    one  6  12 7  foo  three  7  14 
mask = df['A'] == 'foo' pos = np.flatnonzero(mask) df.iloc[pos]       A      B  C   D 0  foo    one  0   0 2  foo    two  2   4 4  foo    two  4   8 6  foo    one  6  12 7  foo  three  7  14 
df.set_index('A', append=True, drop=False).xs('foo', level=1)       A      B  C   D 0  foo    one  0   0 2  foo    two  2   4 4  foo    two  4   8 6  foo    one  6  12 7  foo  three  7  14 
df.query('A == "foo"')       A      B  C   D 0  foo    one  0   0 2  foo    two  2   4 4  foo    two  4   8 6  foo    one  6  12 7  foo  three  7  14 
mask = df['A'].values == 'foo' 
%timeit mask = df['A'].values == 'foo' %timeit mask = df['A'] == 'foo'  5.84 µs ± 195 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each) 166 µs ± 4.45 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each) 
mask = df['A'].values == 'foo' %timeit df[mask] mask = df['A'] == 'foo' %timeit df[mask]  219 µs ± 12.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each) 239 µs ± 7.03 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each) 
pd.DataFrame(df.values[mask], df.index[mask], df.columns).astype(df.dtypes) 
%timeit df[m] %timeit pd.DataFrame(df.values[mask], df.index[mask], df.columns).astype(df.dtypes)  216 µs ± 10.4 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each) 1.43 ms ± 39.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each) 
np.random.seed([3,1415]) d1 = pd.DataFrame(np.random.randint(10, size=(10, 5)), columns=list('ABCDE'))  d1     A  B  C  D  E 0  0  2  7  3  8 1  7  0  6  8  6 2  0  2  0  4  9 3  7  3  2  4  3 4  3  6  7  7  4 5  5  3  7  5  9 6  8  7  6  4  7 7  6  2  6  6  5 8  2  8  7  5  8 9  4  7  6  1  5 
%%timeit mask = d1['A'].values == 7 d1[mask]  179 µs ± 8.73 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each) 
%%timeit mask = d1['A'].values == 7 pd.DataFrame(d1.values[mask], d1.index[mask], d1.columns)  87 µs ± 5.12 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each) 
mask = df['A'].isin(['foo']) df[mask]       A      B  C   D 0  foo    one  0   0 2  foo    two  2   4 4  foo    two  4   8 6  foo    one  6  12 7  foo  three  7  14 
mask = np.in1d(df['A'].values, ['foo']) df[mask]       A      B  C   D 0  foo    one  0   0 2  foo    two  2   4 4  foo    two  4   8 6  foo    one  6  12 7  foo  three  7  14 
res.div(res.min())                           10        30        100       300       1000      3000      10000     30000 mask_standard         2.156872  1.850663  2.034149  2.166312  2.164541  3.090372  2.981326  3.131151 mask_standard_loc     1.879035  1.782366  1.988823  2.338112  2.361391  3.036131  2.998112  2.990103 mask_with_values      1.010166  1.000000  1.005113  1.026363  1.028698  1.293741  1.007824  1.016919 mask_with_values_loc  1.196843  1.300228  1.000000  1.000000  1.038989  1.219233  1.037020  1.000000 query                 4.997304  4.765554  5.934096  4.500559  2.997924  2.397013  1.680447  1.398190 xs_label              4.124597  4.272363  5.596152  4.295331  4.676591  5.710680  6.032809  8.950255 mask_with_isin        1.674055  1.679935  1.847972  1.724183  1.345111  1.405231  1.253554  1.264760 mask_with_in1d        1.000000  1.083807  1.220493  1.101929  1.000000  1.000000  1.000000  1.144175 
res.T.plot(loglog=True) 
def mask_standard(df):     mask = df['A'] == 'foo'     return df[mask]  def mask_standard_loc(df):     mask = df['A'] == 'foo'     return df.loc[mask]  def mask_with_values(df):     mask = df['A'].values == 'foo'     return df[mask]  def mask_with_values_loc(df):     mask = df['A'].values == 'foo'     return df.loc[mask]  def query(df):     return df.query('A == "foo"')  def xs_label(df):     return df.set_index('A', append=True, drop=False).xs('foo', level=-1)  def mask_with_isin(df):     mask = df['A'].isin(['foo'])     return df[mask]  def mask_with_in1d(df):     mask = np.in1d(df['A'].values, ['foo'])     return df[mask] 
res = pd.DataFrame(     index=[         'mask_standard', 'mask_standard_loc', 'mask_with_values', 'mask_with_values_loc',         'query', 'xs_label', 'mask_with_isin', 'mask_with_in1d'     ],     columns=[10, 30, 100, 300, 1000, 3000, 10000, 30000],     dtype=float )  for j in res.columns:     d = pd.concat([df] * j, ignore_index=True)     for i in res.index:a         stmt = '{}(d)'.format(i)         setp = 'from __main__ import d, {}'.format(i)         res.at[i, j] = timeit(stmt, setp, number=50) 
spec.div(spec.min())                       10        30        100       300       1000      3000      10000     30000 mask_with_values  1.009030  1.000000  1.194276  1.000000  1.236892  1.095343  1.000000  1.000000 mask_with_in1d    1.104638  1.094524  1.156930  1.072094  1.000000  1.000000  1.040043  1.027100 reconstruct       1.000000  1.142838  1.000000  1.355440  1.650270  2.222181  2.294913  3.406735 
spec.T.plot(loglog=True) 
np.random.seed([3,1415]) d1 = pd.DataFrame(np.random.randint(10, size=(10, 5)), columns=list('ABCDE'))  def mask_with_values(df):     mask = df['A'].values == 'foo'     return df[mask]  def mask_with_in1d(df):     mask = np.in1d(df['A'].values, ['foo'])     return df[mask]  def reconstruct(df):     v = df.values     mask = np.in1d(df['A'].values, ['foo'])     return pd.DataFrame(v[mask], df.index[mask], df.columns)  spec = pd.DataFrame(     index=['mask_with_values', 'mask_with_in1d', 'reconstruct'],     columns=[10, 30, 100, 300, 1000, 3000, 10000, 30000],     dtype=float ) 
for j in spec.columns:     d = pd.concat([df] * j, ignore_index=True)     for i in spec.index:         stmt = '{}(d)'.format(i)         setp = 'from __main__ import d, {}'.format(i)         spec.at[i, j] = timeit(stmt, setp, number=50) 
vote vote

72

select * from table where column_name = some_value 
table[table.column_name == some_value] 
table[(table.column_name == some_value) | (table.column_name2 == some_value2)] 
table.query('column_name == some_value | column_name2 == some_value2') 
import pandas as pd  # Create data set d = {'foo':[100, 111, 222],      'bar':[333, 444, 555]} df = pd.DataFrame(d)  # Full dataframe: df  # Shows: #    bar   foo # 0  333   100 # 1  444   111 # 2  555   222  # Output only the row(s) in df where foo is 222: df[df.foo == 222]  # Shows: #    bar  foo # 2  555  222 
df[(df.foo == 222) | (df.bar == 444)] #    bar  foo # 1  444  111 # 2  555  222 
df.query('foo == 222 | bar == 444') 
vote vote

63

In [167]: n = 10  In [168]: df = pd.DataFrame(np.random.rand(n, 3), columns=list('abc'))  In [169]: df Out[169]:            a         b         c 0  0.687704  0.582314  0.281645 1  0.250846  0.610021  0.420121 2  0.624328  0.401816  0.932146 3  0.011763  0.022921  0.244186 4  0.590198  0.325680  0.890392 5  0.598892  0.296424  0.007312 6  0.634625  0.803069  0.123872 7  0.924168  0.325076  0.303746 8  0.116822  0.364564  0.454607 9  0.986142  0.751953  0.561512  # pure python In [170]: df[(df.a < df.b) & (df.b < df.c)] Out[170]:            a         b         c 3  0.011763  0.022921  0.244186 8  0.116822  0.364564  0.454607  # query In [171]: df.query('(a < b) & (b < c)') Out[171]:            a         b         c 3  0.011763  0.022921  0.244186 8  0.116822  0.364564  0.454607 
exclude = ('red', 'orange') df.query('color not in @exclude') 
vote vote

55

# Example dataframe df = pd.DataFrame({'Sender email':['ex@example.com', "reply@shop.com", "buy@shop.com"]})       Sender email 0  ex@example.com 1  reply@shop.com 2    buy@shop.com 
df.query('`Sender email`.str.endswith("@shop.com")') 
     Sender email 1  reply@shop.com 2    buy@shop.com 
domain = 'shop.com' df.query('`Sender email`.str.endswith(@domain)') 
     Sender email 1  reply@shop.com 2    buy@shop.com 

Questions similaires