Pandas:修订间差异
无编辑摘要 |
无编辑摘要 |
||
第69行: | 第69行: | ||
==== | ==查看数据== | ||
{| class="wikitable" | 表示例中s为一个Series对象,df为一个DataFrame对象: | ||
<syntaxhighlight lang="python" > | |||
>>> s = pd.Series(['a', 'b', 'c']) | |||
>>> s | |||
0 a | |||
1 b | |||
2 c | |||
dtype: object | |||
>>> df = pd.DataFrame([['foo', 22], ['bar', 25], ['test', 18]],columns=['name', 'age']) | |||
>>> df | |||
</syntaxhighlight> | |||
{| class="wikitable" | |||
|- | |- | ||
!属性/方法 | !属性/方法 | ||
第78行: | 第91行: | ||
!示例 | !示例 | ||
|- | |- | ||
| | | head() | ||
| | | 返回前n行数据,默认前5行 | ||
|Series. | | Series.head(n=5) | ||
|DataFrame. | | DataFrame.head(n=5) | ||
| | | <code>df.head()</code>返回df前5行数据<br \><code>df.head(10)</code>返回df前10行数据。 | ||
|- | |- | ||
| | | tail() | ||
| | | 返回最后n行数据,默认最后5行 | ||
| | | Series.tail(n=5) | ||
| DataFrame.tail(n=5) | |||
| | | <code>df.tail()</code>返回df最后5行数据<br \><code>df.tail(10)</code>返回df最后10行数据。 | ||
| | |||
|- | |- | ||
| | | dtypes | ||
| | | 返回数据的Numpy数据类型(dtype对象) | ||
|Series. | |Series.dtypes | ||
|DataFrame. | |DataFrame.dtypes | ||
| | | <code>s.dtypes</code><br \> <code>df.dtypes</code> | ||
|- | |- | ||
| | | dtype | ||
| | | 返回数据的Numpy数据类型(dtype对象) | ||
|Series. | | Series.dtype | ||
| | | − | ||
| | | <code>s.dtype</code> | ||
|- | |- | ||
| | | array | ||
| | | 返回 Series 或 Index 数据的数组,该数组为pandas扩展的Python数组。 | ||
|Series. | | Series.array | ||
| − | | − | ||
| <code>s.array</code> <br \>返回:<PandasArray><br \>['a', 'b', 'c']<br \>Length: 3, dtype: object | |||
|<code> | |||
|- | |- | ||
| | | attrs | ||
| | | 此对象全局属性字典。 | ||
|Series. | | Series.attrs | ||
|DataFrame. | | DataFrame.attrs | ||
|<code>s. | | <code>s.attrs</code>返回{} | ||
|- | |- | ||
| | | hasnans | ||
| | | 如果有任何空值(如Python的None,np.NaN)返回True,否则返回False。 | ||
|Series. | | Series.hasnans | ||
| | | − | ||
| | | <code>s.hasnans</code> <br \>返回False | ||
|- | |- | ||
| | | values | ||
| | | 返回ndarray(NumPy的多维数组)或类似ndarray的形式。 | ||
|Series. | | Series.values | ||
|DataFrame. | | DataFrame.values | ||
| <code>s.values</code>返回array(['a', 'b', 'c'], dtype=object) | |||
|- | |- | ||
| | | ndim | ||
| | | 返回数据的维数,Series返回1,DataFrame返回2 | ||
|Series. | | Series.ndim | ||
|DataFrame. | | DataFrame.ndim | ||
| | | <code>s.ndim</code>返回1 <br \><code>df.ndim</code>返回2 | ||
|- | |- | ||
| | | size | ||
| | | 返回数据中元素的个数 | ||
|Series. | | Series.size | ||
|DataFrame. | | DataFrame.size | ||
|<code> | | <code>s.size</code>返回3 <br \><code>df.size</code>返回6 | ||
|- | |- | ||
| | | shape | ||
| | | 返回数据形状(行数和列数)的元组 | ||
|Series. | | Series.shape | ||
|DataFrame. | | DataFrame.shape | ||
| <code>s.shape</code>返回(3, ) <br \><code>df.shape</code>返回(3, 2) | |||
|- | |- | ||
| | | empty | ||
| | | 返回是否为空,为空返回True | ||
|Series. | | Series.empty | ||
|DataFrame. | | DataFrame.empty | ||
| | | <code>s.empty</code>返回False <br \><code>df.empty</code>返回False | ||
|- | |- | ||
| | | name | ||
| | | 返回Series的名称。 | ||
|Series. | | Series.name | ||
| | | − | ||
| | | <code>s.name</code>返回空 | ||
|- | |- | ||
| | | memory_usage() | ||
| | | 返回Series或DataFrame的内存使用情况,单位Bytes。参数index默认为True,表示包含index。<br \>参数deep默认为False,表示不通过查询dtypes对象来深入了解数据的系统级内存使用情况 | ||
|Series. | | Series.memory_usage(index=True, deep=False) | ||
|DataFrame. | | DataFrame.memory_usage(index=True, deep=False) | ||
|<code>df. | | <code>s.memory_usage()</code>返回152 <br \><code>df.memory_usage(index=False)</code> | ||
|- | |- | ||
| | | info() | ||
| | | 打印DataFrame的简要信息。 | ||
| | | − | ||
|DataFrame. | | DataFrame.info(verbose=True, buf=None, max_cols=None, memory_usage=True, null_counts=True) | ||
|<code>df. | | <code>df.info()</code> | ||
|- | |- | ||
| | | select_dtypes() | ||
| | | 根据列的dtypes返回符合条件的DataFrame子集 | ||
| | | − | ||
| | | DataFrame.select_dtypes(include=None, exclude=None) | ||
| <code>df.select_dtypes(include=['float64'])</code> | |||
|- | |- | ||
| | |} | ||
==索引== | |||
===查看索引=== | |||
| | {| class="wikitable" | ||
|- | |- | ||
|idxmax() | !属性/方法 | ||
| | !描述 | ||
!Series | |||
!DataFrame | |||
!示例 | |||
|- | |||
| index | |||
| 索引(行标签),可以查看和设置 | |||
|Series.index | |||
|DataFrame.index | |||
| <code>s.index</code>返回RangeIndex(start=0, stop=3, step=1) <br \> <code>df.index</code> | |||
|- | |||
| columns | |||
| 列标签,Series无,可以查看和设置 | |||
| − | |||
|DataFrame.columns | |||
| <code>df.columns</code> | |||
|- | |||
| keys() | |||
| 列标签,没有就返回索引 | |||
| Series.keys() | |||
| DataFrame.keys() | |||
| <code>df.keys()</code>返回列标签 | |||
|- | |||
| axes | |||
| 返回轴标签(行标签和列标签)的列表。<br \>Series返回[index] <br \>DataFrame返回[index, columns] | |||
| Series.axes | |||
| DataFrame.axes | |||
| <code>s.axes</code>返回[RangeIndex(start=0, stop=3, step=1)] <br \><code>df.axes</code>返回索引和列名。 | |||
|- | |||
|idxmax() | |||
|返回第一次出现最大值的索引位置。 | |||
|Series.idxmax(axis=0, skipna=True, *args, **kwargs) | |Series.idxmax(axis=0, skipna=True, *args, **kwargs) | ||
|DataFrame.idxmax(axis=0, skipna=True) | |DataFrame.idxmax(axis=0, skipna=True) | ||
第211行: | 第230行: | ||
|- | |- | ||
|idxmin() | |idxmin() | ||
| | |返回第一次出现最小值的索引位置。 | ||
|Series.idxmin(axis=0, skipna=True, *args, **kwargs) | |Series.idxmin(axis=0, skipna=True, *args, **kwargs) | ||
|DataFrame.idxmin(axis=0, skipna=True) | |DataFrame.idxmin(axis=0, skipna=True) | ||
|<code>s.idxmin()</code> | |<code>s.idxmin()</code> | ||
|} | |||
===设置与重置索引=== | |||
Series对象和DataFrame对象可以通过<code>.index</code>或<code>.columns</code>属性设置,还可以通过以下方法来设置与重置。 | |||
{| class="wikitable" | |||
|- | |- | ||
| | !属性/方法 | ||
| | !描述 | ||
|Series.reindex(index=None, | !Series | ||
|DataFrame.reindex( | !DataFrame | ||
!示例 | |||
|- | |||
|set_index() | |||
|将某列设置为索引 | |||
| − | |||
|DataFrame.set_index(keys, drop=True, append=False, inplace=False, verify_integrity=False) | |||
|<code>df.set_index('col_3')</code>将‘col_3’列设置为索引。 | |||
|- | |||
|reset_index() | |||
|重置索引,默认从0开始整数。参数:<br \><code>drop</code>是否删除原索引,默认不删除 <br \><code>level</code>重置多索引的一个或多个级别。 | |||
|Series.reset_index(level=None, drop=False, name=None, inplace=False) | |||
|DataFrame.reset_index(level=None, drop=False, inplace=False, col_level=0, col_fill='') | |||
| | |||
|- | |||
|reindex() | |||
| 用Series或DataFrame匹配新索引。对于新索引有旧索引无的默认使用NaN填充,新索引无旧索引有的删除。 | |||
|Series.reindex(index=None, method=None, copy=True, level=None, fill_value=nan, limit=None, tolerance=None) | |||
|DataFrame.reindex(labels=None, index=None, columns=None, axis=None, method=None, copy=True, level=None, fill_value=nan, limit=None, tolerance=None) | |||
| | | | ||
|- | |- | ||
第229行: | 第272行: | ||
|- | |- | ||
|rename() | |rename() | ||
| | |修改轴(索引或列)标签。 | ||
|Series.rename(index=None, *, axis=None, copy=True, inplace=False, level=None, errors='ignore') | |Series.rename(index=None, *, axis=None, copy=True, inplace=False, level=None, errors='ignore') | ||
|DataFrame.rename( | |DataFrame.rename(mapper=None, index=None, columns=None, axis=None, copy=True, inplace=False, level=None, errors='ignore') | ||
| | | | ||
|- | |- | ||
第238行: | 第281行: | ||
|Series.rename_axis(**kwargs) | |Series.rename_axis(**kwargs) | ||
|DataFrame.rename_axis(**kwargs) | |DataFrame.rename_axis(**kwargs) | ||
| | | | ||
|- | |- | ||
第264行: | 第289行: | ||
|<code>df.set_axis(['a', 'b', 'c'], axis='index')</code><br \><code>df.set_axis(['I', 'II'], axis='columns')</code> | |<code>df.set_axis(['a', 'b', 'c'], axis='index')</code><br \><code>df.set_axis(['I', 'II'], axis='columns')</code> | ||
|- | |- | ||
| | |add_prefix() | ||
| | |索引或列标签添加前缀 | ||
|Series. | |Series.add_prefix(prefix) | ||
|DataFrame. | |DataFrame.add_prefix(prefix) | ||
| | |<code>s.add_prefix('item_')</code> <br \><code>df.add_prefix('col_')</code> | ||
|- | |- | ||
| | |add_suffix() | ||
| | |索引或列标签添加后缀 | ||
|Series. | |Series.add_suffix(suffix) | ||
|DataFrame. | |DataFrame.add_suffix(suffix) | ||
| | | | ||
|} | |} | ||
==== | |||
{| class="wikitable" | ==数据选取与迭代== | ||
===概览=== | |||
{| class="wikitable" style="width: 100%;" | |||
|- | |||
! 方法 | |||
! 描述 | |||
! 示例 | |||
|- | |- | ||
|索引运算符 <br \><code>[ ]</code> | |||
|Python中序列对象使用<code>self[key]</code>是在调用对象的特殊方法<code>__getitem__()</code> 。Python运算符<code>[ ]</code>有3种通用序列操作:<br \> <code>self[i]</code> 取第i项(起始为0)<br \> <code>self[i:j]</code> 从 i 到 j 的切片<br \> <code>self[i:j:k]</code> 从 i 到 j 步长为 k 的切片 <br \>Pandas支持NumPy扩展的一些操作:<br \><code>self[布尔索引]</code>,如s[s>5] | |||
|<code>s[1]</code> 取s的第二个值<br \> <code>df[1:-1]</code>切片,返回df第二行到倒数第二行组成的DataFrame对象 | |||
|- | |- | ||
| | |属性运算符<br \><code>.</code> | ||
|同Python字典属性获取 | |||
|<code>df.a</code>返回df的名称为a的列 | |||
|- | |- | ||
| | |按标签选择 <br \><code>loc[ ]</code> | ||
| | |通过对象调用<code>.loc</code>属性生成序列对象,序列对象调用索引运算符<code>[]</code>。 | ||
| | |<code>df.loc[2]</code>选取索引(行标签)值为2的行 <br \><code>df.loc[1:2]</code> 选取索引值为1到2的行 <br \><code><nowiki>df.loc[[1,2]]</nowiki></code>选取索引值为1和2的行 <br \><code>df.loc[1,'name']</code>选取行标签值为1,列标签值为'name'的单个值<br \><code>df.loc[1:2,'name']</code>选取行标签值为1到2,列标签值为'name'的数据 | ||
|- | |- | ||
| | |按位置选择 <br \><code>iloc[ ]</code> | ||
| | |纯粹基于整数位置的索引方法,通过对象调用<code>.iloc</code>属性生成序列对象,然后序列对象调用索引运算符<code>[]</code>。 | ||
| | |<code>s.iloc[2]</code>选取行标签位置为2的行 <br \><code>s.iloc[:2]</code> 选取索引为0到2(不包含2)的值 <br \><code><nowiki>s.iloc[[True,False,True]]</nowiki></code>选取索引位置为True的值 <br \><code>s.iloc[lambda x: x.index % 2 == 0]</code>选取索引为双数的值 | ||
|- | |- | ||
| | |按标签选择单个 <br \><code>at[ ]</code> | ||
| | |通过行轴和列轴标签对获取或设置单个值。 | ||
|<code>s.at[1]</code>返回'b'<br \><code>s.at[2]='d'</code>设置索引位置为第三的值等于'd' <br \><code>df.at[2, 'name']</code>获取index=2,columns='name'点的值 | |||
|- | |- | ||
| | |按位置选择单个 <br \><code>iat[ ]</code> | ||
| | |通过行轴和列轴整数位置获取或设置单个值。 | ||
| Series. | |<code>s.iat[1]</code><br \><code>s.iat[2]='d'</code> | ||
| DataFrame. | |- | ||
| | |查询方法 <br \><code>query()</code> | ||
| DataFrame对象query()方法,使用表达式进行选择。<br \><code>DataFrame.query(expr, inplace=False, **kwargs)</code> | |||
|<code>df.query('A > B')</code>相当于<code>df[df.A > df.B]</code> | |||
|- | |||
|通过行列标签筛选 <br \><code>filter()</code> | |||
|通过行列标签筛选 <br \> <code>Series.filter(items=None, like=None, regex=None, axis=None)</code> <br \> <code>DataFrame.filter(items=None, like=None, regex=None, axis=None)</code> | |||
|<code>df.filter(like='bbi', axis=0)</code>选取行标签包含'bbi'的行。 | |||
|- | |||
|多索引选择 <br \><code>xs()</code> | |||
| 只能用于选择数据,不能设置值。可以使用<code>iloc[ ]</code>或<code>loc[ ]</code>替换。<br \><code>Series.xs(key, axis=0, level=None, drop_level=True)</code> <br \> <code>DataFrame.xs(key, axis=0, level=None, drop_level=True)</code> | |||
| df.xs('a', level=1) | |||
|- | |||
| 选择一列 <br \>get() | |||
| 选择某一列 <br \> <code>Series.get(key, default=None) </code> <br \> <code>DataFrame.get(key, default=None)</code> | |||
| <code>df.get('a')</code>返回a列 | |||
|- | |- | ||
| | | 选择指定标签列并删除 <br \><code>pop()</code> | ||
| | | 返回某一列,并从数据中删除,如果列名没找到抛出KeyError。<br \> <code>Series.pop(item) </code> <br \> <code>DataFrame.pop(item) </code> | ||
| | |<code> df.pop('a')</code>返回a列并从df中删除。 | ||
|- | |- | ||
|- | |- | ||
| | | 删除指定标签列 <br \><code>drop()</code> | ||
| 返回删除指定标签列后的数据 <br \> <code>Series.drop(labels=None, axis=0, index=None, columns=None, level=None, inplace=False, errors='raise')</code> <br \> <br \> <code>DataFrame.drop(labels=None, axis=0, index=None, columns=None, level=None, inplace=False, errors='raise') </code> | |||
| Series. | |||
| | | | ||
|- | |- | ||
| | | 抽样 <br \><code>sample()</code> | ||
| | | 返回抽样数据 <br \> <code>Series.sample(n=None, frac=None, replace=False, weights=None, random_state=None, axis=None) </code> <br \><br \> <code>DataFrame.sample(n=None, frac=None, replace=False, weights=None, random_state=None, axis=None)</code> | ||
| | | | ||
|} | |||
{{了解更多 | |||
|[https://pandas.pydata.org/docs/user_guide/indexing.html Pandas 指南:索引与选择数据] | |||
|[https://docs.python.org/zh-cn/3/library/stdtypes.html#common-sequence-operations Python 3 文档:序列类型 - 通用序列操作] | |||
|[https://docs.python.org/zh-cn/3/reference/datamodel.html#special-method-names Python 3 文档:数据模型 - 特殊方法名称] | |||
|[https://numpy.org/doc/stable/user/absolute_beginners.html#indexing-and-slicing NumPy 文档:初学者基础知识 - 索引和切片] | |||
}} | |||
===按标签选择=== | |||
pandas提供基于标签的索引方法,通过对象调用<code>.loc</code>属性生成序列对象,序列对象调用索引运算符<code>[]</code>。该方法严格要求,每个标签都必须在索引中,否则会抛出KeyError错误。切片时,如果索引中存在起始边界和终止边界,则都将包括在内。整数是有效的标签,但它们引用的是标签,而不是位置(索引顺序)。 | |||
{| class="wikitable" style="width: 100%;" | |||
|- | |- | ||
! .loc索引输入值 | |||
! 描述 | |||
! Series示例 | |||
! DataFrame示例 | |||
|- | |- | ||
| | |单个标签 | ||
| | |例如5或'a'(注意,5被解释为索引的标签,而不是整数位置。) | ||
| | |<code>s.loc['a']</code> 返回s索引为'a'的值 | ||
| | |<code>df.loc['b']</code> 返回df索引(行标签)为'b'的行(Series对象) | ||
|- | |- | ||
| | |标签列表或标签数组 | ||
| | |如['a', 'c'](注意:这种方式会有两组方括号<code><nowiki>[[]]</nowiki></code>,里面是生成列表,外面是索引取值操作) | ||
| | |<code><nowiki>s.loc[['a', 'c']]</nowiki></code>返回s索引为'a'和'c'的值(Series对象) | ||
| | |<code><nowiki>df.loc[['a', 'c']]</nowiki></code>返回df索引(行标签)为'a'和'c'的行(DataFrame对象) | ||
|- | |- | ||
| | |带标签的切片对象 | ||
| | |切片如 'a':'f'表示标签'a'到标签'f',步长切片如 'a':'f':2表示标签'a'到标签'f'按步长2选取(注意:和Python切片不同,这里包含开始标签和结束标签),还有一些常用示例如:<br \><code>'f':</code>从标签'f'开始到最后<br \><code>:'f'</code>从最开始到标签'f'<br \><code>:</code>全部标签 | ||
| | |<code>s.loc['a':'c']</code> 返回s索引'a'到'c'的值 | ||
| | |<code>df.loc['b':'f']</code> 返回df索引(行标签)'b'到'f'的行(DataFrame对象) | ||
|- | |- | ||
| | |行标签,列标签 | ||
| | |只有DataFrame可用,格式<code>行标签,列标签</code>,行标签或列标签可以使用切片或数组等。 | ||
| | |− | ||
| | |<code>df.loc['a','name']</code>选取索引为'a',列标签为'name'的单个值。<br \><code>df.loc['a':'c','name' ]</code>返回Series对象<br \><code>df.loc['a':'c','id':'name' ]</code>返回DataFrame对象 | ||
|- | |- | ||
| | |布尔数组 | ||
|如[True, False, True]。注意布尔数组长度要与轴标签长度相同,否则会抛出IndexError错误。 | |||
| | |<code><nowiki>s.loc[[True, False, True]]</nowiki></code> 返回s的第1个和第3个值 | ||
| | |<code><nowiki>df.loc[[False, True, True]]</nowiki></code> 返回df的第2行和第3行 | ||
| | |||
|- | |- | ||
| | |callable function | ||
| | |会返回上面的一种索引形式 | ||
| | | | ||
| | | | ||
|- | |- | ||
| | |} | ||
| | |||
| Series. | {{了解更多 | ||
|[https://pandas.pydata.org/docs/user_guide/indexing.html#selection-by-label Pandas 指南:索引与选择数据 - 按标签选择] | |||
| | |[https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.loc.html Pandas 参考:DataFrame对象 - DataFrame.loc] | ||
|[https://pandas.pydata.org/docs/reference/api/pandas.Series.loc.html Pandas 参考:Series对象 - Series.loc] | |||
}} | |||
===按位置选择=== | |||
pandas还提供纯粹基于整数位置的索引方法,通过对象调用<code>.iloc</code>属性生成序列对象,然后序列对象调用索引运算符<code>[]</code>。尝试使用非整数,即使有效标签也会引发IndexError。索引是从0开始的整数。切片时,包含起始索引,不包含结束索引。 | |||
{| class="wikitable" style="width: 100%;" | |||
|- | |- | ||
! .iloc索引输入值 | |||
! 描述 | |||
! Series示例 | |||
! DataFrame示例 | |||
|- | |- | ||
| | |单个整数 | ||
| | |例如3 | ||
| | |<code>s.iloc[0]</code> 返回s位置索引为0的值,即第一值 | ||
| | |<code>df.iloc[5]</code> 返回df索引为5的行(Series对象),即df的第六行 | ||
|- | |- | ||
| | |整数列表或数组 | ||
| | |如[0,5](注意:这种方式会有两组方括号<code><nowiki>[[]]</nowiki></code>,里面是生成列表,外面是索引取值操作) | ||
| | |<code><nowiki>s.iloc[[0,5]]</nowiki></code>返回s索引为0和5的值(Series对象) | ||
| | |<code><nowiki>df.iloc[[2,5]]</nowiki></code>返回df索引为2和5的行(DataFrame对象) | ||
|- | |- | ||
| | |整数切片对象 | ||
| | |切片如 3:5表示索引3到索引5(不包含索引5),步长切片如 0:5:2表示索引0到索引5(不包含索引5)按步长2选取(注意:和Python切片相同,包含起始索引,不包含结束索引),还有一些常用示例如:<br \><code>2:</code>从索引2开始到最后<br \><code>:6</code>从最开始到索引6(不包含索引6)<br \><code>:</code>全部索引 | ||
| | |<code>s.iloc[3:5]</code> 返回s索引3到索引5(不包含索引5)的值 | ||
| | |<code>df.iloc[3:5]</code> 返回df索引3到索引5(不包含索引5)的行(DataFrame对象) | ||
|- | |- | ||
| | |行位置索引,列位置索引 | ||
| | |只有DataFrame可用,格式<code>行位置索引,列位置索引</code>,行位置或列位置可以使用切片或数组等。 | ||
| | |− | ||
| | |<code>df.iloc[0, 2]</code>选取第1行第3列的单个值。<br \><code>df.iloc[2:5, 6 ]</code>返回第3行到5行中的第7列(Series对象)<br \><code>df.iloc[2:5, 0:2 ]</code>返回第3行到5行中的第1列到第2列(DataFrame对象) | ||
|- | |- | ||
| | |布尔数组 | ||
| | |如[True, False, True]。注意布尔数组长度要与轴标签长度相同,否则会抛出IndexError错误。 | ||
| | |<code><nowiki>s.iloc[[True, False, True]]</nowiki></code> 返回s的第1个和第3个值 | ||
| | |<code><nowiki>df.iloc[[False, True, True]]</nowiki></code> 返回df的第2行和第3行 | ||
|- | |- | ||
| | |callable function | ||
| | |会返回上面的一种索引形式 | ||
| | | | ||
| | | | ||
|- | |- | ||
| | |} | ||
| | |||
| | {{了解更多 | ||
|[https://pandas.pydata.org/docs/user_guide/indexing.html#selection-by-position Pandas 指南:索引与选择数据 - 按位置选择] | |||
|[https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.iloc.html Pandas 参考:DataFrame对象 - DataFrame.iloc] | |||
|- | |[https://pandas.pydata.org/docs/reference/api/pandas.Series.iloc.html Pandas 参考:Series对象 - Series.iloc] | ||
}} | |||
===迭代=== | |||
| | {| class="wikitable" | ||
|- | |- | ||
!属性/方法 | |||
!描述 | |||
!Series | |||
!DataFrame | |||
!示例 | |||
|- | |- | ||
| | | __iter__() | ||
| | | Series返回值的迭代器 <br \>DataFrame返回轴的迭代器 | ||
| Series. | | Series.__iter__() | ||
| DataFrame. | | DataFrame.__iter__() | ||
| | | <code>s.__iter__()</code> | ||
|- | |- | ||
| | | items() | ||
| | | Series遍历,返回索引和值的迭代器 <br \>DataFrame按列遍历,返回列标签和列的Series对迭代器。 | ||
| Series. | | Series.items() | ||
| DataFrame. | | DataFrame.items() | ||
| | | <code>s.items()</code> <br \> <code>df.items()</code> <br \> <code>for label, content in df.items():</code> | ||
|- | |- | ||
| | | iteritems() | ||
| | | 返回可迭代的键值对,Series返回索引和值,DataFrame返回列名和列。 | ||
| Series. | |Series.iteritems() | ||
| DataFrame. | |DataFrame.iteritems() | ||
| | | | ||
|- | |- | ||
| | | iterrows() | ||
| | | Iterate over DataFrame rows as (index, Series) pairs. | ||
| | | − | ||
| DataFrame. | |DataFrame.iterrows() | ||
| | | | ||
|- | |- | ||
| | | itertuples() | ||
| | |Iterate over DataFrame rows as namedtuples. | ||
| | | − | ||
| DataFrame. | |DataFrame.itertuples(index=True, name='Pandas') | ||
| | | | ||
|} | |||
==处理数据== | |||
===处理重复数据=== | |||
如果要标识或删除重复的行,可以使用<code>duplicated</code>和<code>drop_duplicates</code>方法。 | |||
{| class="wikitable" style="width: 100%;" | |||
! 方法 | |||
! 描述 | |||
! 不同对象的方法 | |||
! 示例 | |||
|- | |- | ||
| | | duplicated | ||
| | | 标识重复行,返回一个布尔值序列。参数:<br \>keep:默认为<code>keep='first'</code>标记第一次出现的重复项为False,其他都为True。<code>keep='last'</code>标记最后出现的重复项为False,其他都为True。<code>keep=False</code>标记所有重复项为True。 | ||
| | |||
| | |||
| | | | ||
|- | |- | ||
| | | drop_duplicates | ||
| | | 删除重复行,返回删除后的对象。参数:<br \>keep:默认为<code>keep='first'</code>保留第一次出现的重复项,其他都删除。<code>keep='last'</code>保留最后出现的重复项,其他都删除。<code>keep=False</code>重复项都删除。 | ||
| Series. | | Series.drop_duplicates(keep='first', inplace=False) <br \><br \>DataFrame.drop_duplicates(subset=None, keep='first', inplace=False, ignore_index=False) <br \><br \>Index.drop_duplicates(keep='first') | ||
| <code>df.drop_duplicates()</code>删除df中所有列的值都相同的行。<br \><code>df.drop_duplicates(['日期', '品种'])</code>删除df中日期和品种列都相同的行 | |||
| | |} | ||
|} | {{了解更多 | ||
|[https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#duplicate-data Pandas 指南:索引和数据选择 - 重复数据] | |||
|[https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop_duplicates.html Pandas 参考:DataFrame.drop_duplicates] | |||
}} | |||
===处理缺失的数据=== | |||
==== | ===数据类型转换=== | ||
{| class="wikitable" | |||
{{了解更多 | |||
|[https://pandas.pydata.org/pandas-docs/stable/user_guide/basics.html#dtypes Pandas 指南:基础 - dtypes] | |||
|[https://numpy.org/doc/stable/reference/arrays.scalars.html Numpy 参考:标量 ] | |||
|[https://numpy.org/doc/stable/reference/arrays.dtypes.html Numpy 参考:数据类型对象(dtype)] | |||
|[https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.astype.html Pandas 参考:DataFrame.astype] | |||
}} | |||
===处理文本数据=== | |||
====字符串方法==== | |||
Series和Index配备了一组字符串处理方法,这些方法使您可以轻松地对数组的每个元素进行操作。也许最重要的是,这些方法会自动排除丢失/ NA值。这些可以通过str属性访问。 | |||
{| class="wikitable" style="width: 100%;" | |||
! 方法 | |||
! 描述 | |||
! 示例 | |||
|- | |- | ||
| Series.str.capitalize(*args, **kwargs) | |||
| Convert strings in the Series/Index to be capitalized. | |||
| | |||
|- | |- | ||
| | | Series.str.casefold(*args, **kwargs) | ||
| Convert strings in the Series/Index to be casefolded. | |||
| | |||
| | | | ||
|- | |- | ||
| | | Series.str.cat(*args, **kwargs) | ||
| Concatenate strings in the Series/Index with given separator. | |||
| | |||
| | | | ||
|- | |- | ||
| | | Series.str.center(*args, **kwargs) | ||
| Pad left and right side of strings in the Series/Index. | |||
| | |||
| | | | ||
|- | |- | ||
| | | Series.str.contains(*args, **kwargs) | ||
| Test if pattern or regex is contained within a string of a Series or Index. | |||
| | |||
| | | | ||
|- | |- | ||
| | | Series.str.count(*args, **kwargs) | ||
| Count occurrences of pattern in each string of the Series/Index. | |||
| | |||
| | | | ||
|- | |- | ||
| | | Series.str.decode(encoding[, errors]) | ||
| Decode character string in the Series/Index using indicated encoding. | |||
| | |||
| | | | ||
|- | |- | ||
| | | Series.str.encode(*args, **kwargs) | ||
| Encode character string in the Series/Index using indicated encoding. | |||
| | |||
| | | | ||
|- | |- | ||
| | | Series.str.endswith(*args, **kwargs) | ||
| Test if the end of each string element matches a pattern. | |||
| | |||
| | | | ||
|- | |- | ||
| | | Series.str.extract(*args, **kwargs) | ||
| | | Extract capture groups in the regex pat as columns in a DataFrame. | ||
| Series. | | | ||
| DataFrame. | |- | ||
| Series.str.extractall(*args, **kwargs) | |||
| Extract capture groups in the regex pat as columns in DataFrame. | |||
| | | | ||
|- | |- | ||
| | | Series.str.find(*args, **kwargs) | ||
| Return lowest indexes in each strings in the Series/Index. | |||
| | |||
| | | | ||
|- | |- | ||
| | | Series.str.findall(*args, **kwargs) | ||
| Find all occurrences of pattern or regular expression in the Series/Index. | |||
| | |||
| | | | ||
|- | |- | ||
| | | Series.str.get(i) | ||
| Extract element from each component at specified position. | |||
| | |||
| | | | ||
|- | |- | ||
| | | Series.str.index(*args, **kwargs) | ||
| Return lowest indexes in each string in Series/Index. | |||
| | |||
| | | | ||
|- | |- | ||
| | | Series.str.join(*args, **kwargs) | ||
| Join lists contained as elements in the Series/Index with passed delimiter. | |||
| | |||
| | | | ||
|- | |- | ||
| | | Series.str.len(*args, **kwargs) | ||
| | | Compute the length of each element in the Series/Index. | ||
| | | | ||
|- | |- | ||
| | | Series.str.ljust(*args, **kwargs) | ||
| Pad right side of strings in the Series/Index. | |||
| | |||
| | | | ||
|- | |- | ||
| | | Series.str.lower(*args, **kwargs) | ||
| Convert strings in the Series/Index to lowercase. | |||
| | |||
| | | | ||
|- | |- | ||
| | | Series.str.lstrip(*args, **kwargs) | ||
| | | Remove leading characters. | ||
| Series. | | | ||
| | |- | ||
| Series.str.match(*args, **kwargs) | |||
| Determine if each string starts with a match of a regular expression. | |||
| | | | ||
|- | |- | ||
| | | Series.str.normalize(*args, **kwargs) | ||
| Return the Unicode normal form for the strings in the Series/Index. | |||
| | |||
| | | | ||
|- | |- | ||
| | | Series.str.pad(*args, **kwargs) | ||
| Pad strings in the Series/Index up to width. | |||
| | |||
| | | | ||
|- | |- | ||
| | | Series.str.partition(*args, **kwargs) | ||
| Split the string at the first occurrence of sep. | |||
| | |||
| | | | ||
|- | |- | ||
| | | Series.str.repeat(*args, **kwargs) | ||
| Duplicate each string in the Series or Index. | |||
| | |||
| | | | ||
|- | |- | ||
| | | Series.str.replace(*args, **kwargs) | ||
| | | Replace each occurrence of pattern/regex in the Series/Index. | ||
| Series. | | | ||
| | |- | ||
| Series.str.rfind(*args, **kwargs) | |||
| Return highest indexes in each strings in the Series/Index. | |||
| | | | ||
|- | |- | ||
| | | Series.str.rindex(*args, **kwargs) | ||
| Return highest indexes in each string in Series/Index. | |||
| | |||
| | | | ||
|- | |- | ||
| | | Series.str.rjust(*args, **kwargs) | ||
| | | Pad left side of strings in the Series/Index. | ||
| | | | ||
|- | |- | ||
| Series.str.rpartition(*args, **kwargs) | |||
| Split the string at the last occurrence of sep. | |||
| | |||
|- | |- | ||
| | | Series.str.rstrip(*args, **kwargs) | ||
| Remove trailing characters. | |||
| | |||
| | | | ||
|- | |- | ||
| | | Series.str.slice([start, stop, step]) | ||
| Slice substrings from each element in the Series or Index. | |||
| | |||
| | | | ||
|- | |- | ||
| | | Series.str.slice_replace(*args, **kwargs) | ||
| Replace a positional slice of a string with another value. | |||
| | |||
| | | | ||
|- | |- | ||
| | | Series.str.split(*args, **kwargs) | ||
| Split strings around given separator/delimiter. | |||
| | |||
| | | | ||
|- | |- | ||
| | | Series.str.rsplit(*args, **kwargs) | ||
| Split strings around given separator/delimiter. | |||
| | |||
| | | | ||
|- | |- | ||
| | | Series.str.startswith(*args, **kwargs) | ||
| Test if the start of each string element matches a pattern. | |||
| | |||
| | | | ||
|- | |- | ||
| | | Series.str.strip(*args, **kwargs) | ||
| Remove leading and trailing characters. | |||
| | |||
| | | | ||
|- | |- | ||
| | | Series.str.swapcase(*args, **kwargs) | ||
| Convert strings in the Series/Index to be swapcased. | |||
| | |||
| | | | ||
|- | |- | ||
| | | Series.str.title(*args, **kwargs) | ||
| Convert | | Convert strings in the Series/Index to titlecase. | ||
| Series. | | | ||
| | |- | ||
| Series.str.translate(*args, **kwargs) | |||
| Map all characters in the string through the given mapping table. | |||
| | | | ||
|- | |- | ||
| | | Series.str.upper(*args, **kwargs) | ||
| Convert strings in the Series/Index to uppercase. | |||
| | |||
| | | | ||
|- | |- | ||
| | | Series.str.wrap(*args, **kwargs) | ||
| Wrap strings in Series/Index at specified line width. | |||
| | |||
| | | | ||
|- | |- | ||
| | | Series.str.zfill(*args, **kwargs) | ||
| | | Pad strings in the Series/Index by prepending ‘0’ characters. | ||
| Series. | | | ||
| | |- | ||
| Series.str.isalnum(*args, **kwargs) | |||
| Check whether all characters in each string are alphanumeric. | |||
| | | | ||
|- | |- | ||
| Series.str.isalpha(*args, **kwargs) | |||
| Check whether all characters in each string are alphabetic. | |||
| | |||
|- | |- | ||
| Series.str.isdigit(*args, **kwargs) | |||
| Check whether all characters in each string are digits. | |||
| Series. | | | ||
| | |||
| | |||
|- | |- | ||
| Series.str.isspace(*args, **kwargs) | |||
| Check whether all characters in each string are whitespace. | |||
| Series. | | | ||
| | |||
| | |||
|- | |- | ||
| Series.str.islower(*args, **kwargs) | |||
| Check whether all characters in each string are lowercase. | |||
|Series. | | | ||
| | |||
| | |||
|- | |- | ||
| | | Series.str.isupper(*args, **kwargs) | ||
| Check whether all characters in each string are uppercase. | |||
| | | | ||
| | |||
|- | |- | ||
| Series.str.istitle(*args, **kwargs) | |||
| Check whether all characters in each string are titlecase. | |||
| Series. | | | ||
| | |||
| | |||
|- | |- | ||
| Series.str.isnumeric(*args, **kwargs) | |||
| Check whether all characters in each string are numeric. | |||
|Series. | | | ||
| | |||
| | |||
|- | |- | ||
| Series.str.isdecimal(*args, **kwargs) | |||
| Check whether all characters in each string are decimal. | |||
| Series. | | | ||
| | |||
| | |||
|- | |- | ||
| | | Series.str.get_dummies(*args, **kwargs) | ||
| | | Return DataFrame of dummy/indicator variables for Series. | ||
| | | | ||
| | |} | ||
| | |||
==合并和比较== | |||
===合并=== | |||
{| class="wikitable" style="width: 100%;" | |||
! 方法 | |||
! 描述 | |||
! 对象的方法 | |||
! 示例 | |||
|- | |- | ||
| | | concat() | ||
| | | 沿指定轴合并Series或DataFrame。<br \>参数:<br \><code>objs</code>,由Series或DataFrame组成的列表或字典。<br \><code>axis</code>,指定轴{0,1,…},默认为axis=0表示沿行标签合并,axis=1表示沿列标签合并。<br \><code>join</code>, {'inner','outer'},默认'outer'表示沿轴取并集,'inner'沿轴取交集。<br \><code>ignore_index</code>,布尔值,默认为False表示使用轴原来的标签(索引),True表示原来轴标签都不用,使用0开始递增的整数。<br \><code>keys</code>,列表,默认无。使用列表在轴标签(索引)外层再构造一层标签(索引)。 | ||
| | | pandas.concat(<br \> objs, <br \> axis=0, <br \> join='outer', <br \> ignore_index=False, <br \> keys=None, <br \> levels=None, <br \> names=None, <br \> verify_integrity=False, <br \> sort=False, <br \> copy=True<br \>) | ||
| | | <code>pd.concat([df1,df2])</code>沿行标签合并 <br \><code>pd.concat([df1, df4], axis=1)</code>沿列标签合并 <br \><code>pd.concat([df1,df2,df3], keys=["x", "y", "z"])</code>按行标签合并,并再添加一层行标签(由x,y,z组成)。对结果调用loc["y"]可选取df2数据<br \><code>pd.concat([df1, df4], axis=1, join="inner")</code>沿列标签取交集合并 <br \><code>pd.concat([s1, s2, s3], axis=1, keys=["time", "code", "price"])</code> | ||
|- | |- | ||
| | | append() | ||
| | | 加入,Series的append方法用于连接多个Series。DataFrame的append方法用于从其他DataFrame对象加入多行,并返回一个新的DataFrame对象。 | ||
| Series. | | Series.append(to_append, ignore_index=False, verify_integrity=False)<br \><br \>DataFrame.append(other, ignore_index=False, verify_integrity=False, sort=False) | ||
| <code>s1.append(s2)</code>s1后加入s2 <br \><code>df1.append(df2)</code>df1后加入df2,返回加入后的DataFrame对象。<br \><code>df1.append(df2, ignore_index=True)</code> 忽略原来行标签,结果为从0开始递增的整数。 | |||
| <code> | |||
|- | |- | ||
| | | merge() | ||
| 将DataFrame或命名的Series合并,与数据库join操作类似。<br \>参数:<br \><code>left</code>,DataFrame或命名的Series对象。<br \><code>right</code>,另一个DataFrame或命名的Series对象。<br \><code>on</code>,要连接的列或索引级别名称,必须同时在左右对象中找到。 | |||
| pandas.merge(<br \> left, <br \> right, <br \> how='inner', <br \> on=None, <br \> left_on=None, <br \> right_on=None, <br \> left_index=False, <br \> right_index=False, <br \> sort=False, <br \> suffixes=('_x', '_y'), <br \> copy=True, <br \> indicator=False, <br \> validate=None<br \> ) | |||
| | |||
| <code> | |||
|- | |- | ||
| | | join() | ||
| | | 连接另一个DataFrame的多列。 | ||
| DataFrame.join(other, on=None, how='left', lsuffix='', rsuffix='', sort=False) | |||
| DataFrame. | | | ||
| | |||
|- | |- | ||
| | | merge_ordered() | ||
| | | | ||
| | | | ||
| | | | ||
|- | |- | ||
| | | merge_asof() | ||
| | |||
| | | | ||
| | | | ||
| | |||
|- | |- | ||
| | | assign() | ||
| | | Assign new columns to a DataFrame. | ||
| DataFrame.assign(**kwargs) | |||
| DataFrame. | | | ||
| | |||
|- | |- | ||
| | | update() | ||
| Modify in place using non-NA values from another DataFrame. | |||
| Series.update(other) <br \>DataFrame.update(other, join='left', overwrite=True, filter_func=None, errors='ignore') | |||
| | |||
| | |||
| | |||
| | |||
|- | |- | ||
| insert() | |||
| 在指定位置插入列。 | |||
| DataFrame.insert(loc, column, value, allow_duplicates=False) | |||
| | |||
| | |||
|} | |} | ||
{{了解更多 | |||
|[https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html pandas 用户指南:合并、加入、连接和比较] | |||
|[https://pandas.pydata.org/docs/reference/frame.html#combining-comparing-joining-merging pandas API:DataFrame 合并/比较/加入/合并] | |||
|[https://pandas.pydata.org/docs/reference/series.html#combining-comparing-joining-merging pandas API:Series 合并/比较/加入/合并] | |||
}} | |||
=== | ===比较=== | ||
{| class="wikitable" | |||
{| class="wikitable" | |||
|- | |- | ||
! 方法 | !属性/方法 | ||
! 描述 | !描述 | ||
! 示例 | !Series | ||
!DataFrame | |||
!示例 | |||
|- | |- | ||
| | | isin() | ||
| | | Whether each element in the Series/DataFrame is contained in values. | ||
| | |Series.isin(values) | ||
|DataFrame.isin(values) | |||
| | |||
|- | |- | ||
| | |equals() | ||
| | |Test whether two objects contain the same elements. | ||
|<code>df. | |Series.equals(other) | ||
|DataFrame.equals(other) | |||
|<code>df.equals(df2)</code> | |||
|} | |||
==计算统计== | |||
===计算/描述统计=== | |||
{| class="wikitable" | |||
|- | |- | ||
!属性/方法 | |||
!描述 | |||
!Series | |||
!DataFrame | |||
!示例 | |||
|- | |- | ||
| | | abs() | ||
| | | 返回 Series/DataFrame 每个元素的绝对值。 | ||
| | | Series.abs() | ||
| DataFrame.abs() | |||
| <code>s.abs()</code> <br \> <code>df.abs()</code> | |||
|- | |- | ||
| | | all() | ||
| | | Return whether all elements are True, potentially over an axis. | ||
| | | Series.all(axis=0, bool_only=None, skipna=True, level=None, **kwargs) | ||
| DataFrame.all(axis=0, bool_only=None, skipna=True, level=None, **kwargs) | |||
| | |||
|- | |- | ||
| | | any() | ||
| | | Return whether any element is True, potentially over an axis. | ||
| | | Series.any(axis=0, bool_only=None, skipna=True, level=None, **kwargs) | ||
| DataFrame.any(axis=0, bool_only=None, skipna=True, level=None, **kwargs) | |||
| | |||
|- | |- | ||
| | | clip() | ||
| | | Trim values at input threshold(s). | ||
| Series.clip(lower=None, upper=None, axis=None, inplace=False, *args, **kwargs) | |||
| DataFrame.clip(lower=None, upper=None, axis=None, inplace=False, *args, **kwargs) | |||
| | | | ||
|- | |- | ||
| corr() | |||
| Compute pairwise correlation of columns, excluding NA/null values. | |||
| Series.corr(other, method='pearson', min_periods=None) | |||
| DataFrame.corr(method='pearson', min_periods=1) | |||
| | |||
|- | |- | ||
| | | corrwith() | ||
| | | Compute pairwise correlation. | ||
| | | | ||
| | | DataFrame.corrwith(other, axis=0, drop=False, method='pearson') | ||
| | |||
|- | |- | ||
| | | count() | ||
| | |统计每行或每列值的个数,不包括NA值。 | ||
| Series.count(level=None) | |||
| DataFrame.count(axis=0, level=None, numeric_only=False) | |||
|<code>s.count()</code><br \><code>df.count()</code><br \><code>df.count(axis='columns')</code> | |||
|- | |- | ||
| | | cov() | ||
| | | Compute pairwise covariance of columns, excluding NA/null values. | ||
| | | Series.cov(other, min_periods=None, ddof=1) | ||
| | | DataFrame.cov(min_periods=None, ddof=1) | ||
| | |||
|- | |- | ||
| | | cummax() | ||
| | | Return cumulative maximum over a DataFrame or Series axis. | ||
| | | Series.cummax(axis=None, skipna=True, *args, **kwargs) | ||
| | | DataFrame.cummax(axis=None, skipna=True, *args, **kwargs) | ||
| | |||
|- | |- | ||
| | | cummin() | ||
| | | Return cumulative minimum over a DataFrame or Series axis. | ||
| | | Series.cummin(axis=None, skipna=True, *args, **kwargs) | ||
| | | DataFrame.cummin(axis=None, skipna=True, *args, **kwargs) | ||
| | |||
|- | |- | ||
| | | cumprod() | ||
| | | Return cumulative product over a DataFrame or Series axis. | ||
| Series.cumprod(axis=None, skipna=True, *args, **kwargs) | |||
| DataFrame.cumprod(axis=None, skipna=True, *args, **kwargs) | |||
| | | | ||
|- | |||
| cumsum() | |||
| Return cumulative sum over a DataFrame or Series axis. | |||
| Series.cumsum(axis=None, skipna=True, *args, **kwargs) | |||
| DataFrame.cumsum(axis=None, skipna=True, *args, **kwargs) | |||
| | | | ||
|- | |- | ||
| | | describe() | ||
| Generate descriptive statistics. | |||
| Series.describe(percentiles=None, include=None, exclude=None, datetime_is_numeric=False) | |||
| | | DataFrame.describe(percentiles=None, include=None, exclude=None, datetime_is_numeric=False) | ||
| | | | ||
| | |||
|- | |- | ||
| diff() | |||
| First discrete difference of element. | |||
| Series.diff(periods=1) | |||
| DataFrame.diff(periods=1, axis=0) | |||
| | |||
|- | |- | ||
| | | eval() | ||
| | | Evaluate a string describing operations on DataFrame columns. | ||
| | | | ||
| | | DataFrame.eval(expr, inplace=False, **kwargs) | ||
| | |||
|- | |- | ||
| | | kurt() | ||
| | | Return unbiased kurtosis over requested axis. | ||
| | | Series.kurt(axis=None, skipna=None, level=None, numeric_only=None, **kwargs) | ||
| | | DataFrame.kurt(axis=None, skipna=None, level=None, numeric_only=None, **kwargs) | ||
| | |||
|- | |- | ||
| | | kurtosis() | ||
| | | Return unbiased kurtosis over requested axis. | ||
| | | Series.kurtosis(axis=None, skipna=None, level=None, numeric_only=None, **kwargs) | ||
| | | DataFrame.kurtosis(axis=None, skipna=None, level=None, numeric_only=None, **kwargs) | ||
| | |||
|- | |- | ||
| | | mad() | ||
| | | Return the mean absolute deviation of the values for the requested axis. | ||
| | | Series.mad(axis=None, skipna=None, level=None) | ||
| | | DataFrame.mad(axis=None, skipna=None, level=None) | ||
| | |||
|- | |- | ||
| | | max() | ||
| | | Return the maximum of the values for the requested axis. | ||
| | | Series.max(axis=None, skipna=None, level=None, numeric_only=None, **kwargs) | ||
| | | DataFrame.max(axis=None, skipna=None, level=None, numeric_only=None, **kwargs) | ||
| | |||
|- | |||
| mean() | |||
| Return the mean of the values for the requested axis. | |||
| Series.mean(axis=None, skipna=None, level=None, numeric_only=None, **kwargs) | |||
| DataFrame.mean(axis=None, skipna=None, level=None, numeric_only=None, **kwargs) | |||
| | |||
|- | |- | ||
| | | median() | ||
| | | Return the median of the values for the requested axis. | ||
| Series.median(axis=None, skipna=None, level=None, numeric_only=None, **kwargs) | |||
| DataFrame.median(axis=None, skipna=None, level=None, numeric_only=None, **kwargs) | |||
| | | | ||
|- | |||
| min() | |||
| Return the minimum of the values for the requested axis. | |||
| Series.min(axis=None, skipna=None, level=None, numeric_only=None, **kwargs) | |||
| DataFrame.min(axis=None, skipna=None, level=None, numeric_only=None, **kwargs) | |||
| | | | ||
|- | |- | ||
| | | mode() | ||
| Get the mode(s) of each element along the selected axis. | |||
| Series.mode(dropna=True) | |||
| | | DataFrame.mode(axis=0, numeric_only=False, dropna=True) | ||
| | | | ||
| | |||
|- | |- | ||
| pct_change() | |||
| Percentage change between the current and a prior element. | |||
| Series.pct_change(periods=1, fill_method='pad', limit=None, freq=None, **kwargs) | |||
| DataFrame.pct_change(periods=1, fill_method='pad', limit=None, freq=None, **kwargs) | |||
| | |||
|- | |- | ||
| | | prod() | ||
| | | Return the product of the values for the requested axis. | ||
| Series. | | Series.prod(axis=None, skipna=None, level=None, numeric_only=None, min_count=0, **kwargs) | ||
| DataFrame. | | DataFrame.prod(axis=None, skipna=None, level=None, numeric_only=None, min_count=0, **kwargs) | ||
| | | | ||
|- | |- | ||
| | | product() | ||
| | | Return the product of the values for the requested axis. | ||
| Series. | | Series.product(axis=None, skipna=None, level=None, numeric_only=None, min_count=0, **kwargs) | ||
| DataFrame. | | DataFrame.product(axis=None, skipna=None, level=None, numeric_only=None, min_count=0, **kwargs) | ||
| | | | ||
|- | |- | ||
| | | quantile() | ||
| | | Return values at the given quantile over requested axis. | ||
|Series. | | Series.quantile(q=0.5, interpolation='linear') | ||
|DataFrame. | | DataFrame.quantile(q=0.5, axis=0, numeric_only=True, interpolation='linear') | ||
| | | | ||
|- | |- | ||
| | | rank() | ||
| | | Compute numerical data ranks (1 through n) along axis. | ||
| | | Series.rank(axis=0, method='average', numeric_only=None, na_option='keep', ascending=True, pct=False) | ||
|DataFrame. | | DataFrame.rank(axis=0, method='average', numeric_only=None, na_option='keep', ascending=True, pct=False) | ||
| | | | ||
|- | |- | ||
| | | round() | ||
| | | Round a DataFrame to a variable number of decimal places. | ||
| | | Series.round(decimals=0, *args, **kwargs) | ||
|DataFrame. | | DataFrame.round(decimals=0, *args, **kwargs) | ||
| | | | ||
|- | |- | ||
| | | sem() | ||
| | | Return unbiased standard error of the mean over requested axis. | ||
| | | Series.sem(axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs) | ||
| DataFrame.sem(axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs) | |||
| | | | ||
|- | |- | ||
| | | skew() | ||
| | | Return unbiased skew over requested axis. | ||
| Series. | | Series.skew(axis=None, skipna=None, level=None, numeric_only=None, **kwargs) | ||
| DataFrame.skew(axis=None, skipna=None, level=None, numeric_only=None, **kwargs) | |||
| | |||
=== | |||
| | | | ||
|- | |- | ||
| Series. | | sum() | ||
| | | Return the sum of the values for the requested axis. | ||
| Series.sum(axis=None, skipna=None, level=None, numeric_only=None, min_count=0, **kwargs) | |||
| DataFrame.sum(axis=None, skipna=None, level=None, numeric_only=None, min_count=0, **kwargs) | |||
| | | | ||
|- | |- | ||
| Series. | | std() | ||
| | | Return sample standard deviation over requested axis. | ||
| Series.std(axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs) | |||
| DataFrame.std(axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs) | |||
| | | | ||
|- | |- | ||
| Series. | | var() | ||
| | | Return unbiased variance over requested axis. | ||
| Series.var(axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs) | |||
| DataFrame.var(axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs) | |||
| | | | ||
|- | |- | ||
| Series. | | nunique() | ||
| | | Count distinct observations over requested axis. | ||
| Series.nunique(dropna=True) | |||
| DataFrame.nunique(axis=0, dropna=True) | |||
| | | | ||
|- | |- | ||
| Series. | | value_counts() | ||
| | | Return a Series containing counts of unique rows in the DataFrame. | ||
| Series.value_counts(normalize=False, sort=True, ascending=False, bins=None, dropna=True) | |||
| DataFrame.value_counts(subset=None, normalize=False, sort=True, ascending=False) | |||
| | | | ||
|} | |||
===二元运算功能=== | |||
{| class="wikitable" | |||
|- | |- | ||
!属性/方法 | |||
!描述 | |||
!Series | |||
!DataFrame | |||
!示例 | |||
|- | |- | ||
| Series. | | add() | ||
| | | Get Addition of dataframe and other, element-wise (binary operator add). | ||
| Series.add(other, level=None, fill_value=None, axis=0) | |||
| DataFrame.add(other, axis='columns', level=None, fill_value=None) | |||
| | | | ||
|- | |- | ||
| Series. | | sub() | ||
| | | Get Subtraction of dataframe and other, element-wise (binary operator sub). | ||
| Series.sub(other, level=None, fill_value=None, axis=0) | |||
| DataFrame.sub(other, axis='columns', level=None, fill_value=None) | |||
| | | | ||
|- | |- | ||
| Series. | | mul() | ||
| | | Get Multiplication of dataframe and other, element-wise (binary operator mul). | ||
| Series.mul(other, level=None, fill_value=None, axis=0) | |||
| DataFrame.mul(other, axis='columns', level=None, fill_value=None) | |||
| | | | ||
|- | |- | ||
| Series. | | div() | ||
| | | Get Floating division of dataframe and other, element-wise (binary operator truediv). | ||
| Series.div(other, level=None, fill_value=None, axis=0) | |||
| DataFrame.div(other, axis='columns', level=None, fill_value=None) | |||
| | | | ||
|- | |- | ||
| Series. | | truediv() | ||
| | | Get Floating division of dataframe and other, element-wise (binary operator truediv). | ||
| Series.truediv(other, level=None, fill_value=None, axis=0) | |||
| DataFrame.truediv(other, axis='columns', level=None, fill_value=None) | |||
| | | | ||
|- | |- | ||
| Series. | | floordiv() | ||
| | | Get Integer division of dataframe and other, element-wise (binary operator floordiv). | ||
| Series.floordiv(other, level=None, fill_value=None, axis=0) | |||
| DataFrame.floordiv(other, axis='columns', level=None, fill_value=None) | |||
| | | | ||
|- | |- | ||
| Series. | | mod() | ||
| | | Get Modulo of dataframe and other, element-wise (binary operator mod). | ||
| Series.mod(other, level=None, fill_value=None, axis=0) | |||
| DataFrame.mod(other, axis='columns', level=None, fill_value=None) | |||
| | | | ||
|- | |- | ||
| Series. | | pow() | ||
| | | Get Exponential power of dataframe and other, element-wise (binary operator pow). | ||
| Series.pow(other, level=None, fill_value=None, axis=0) | |||
| DataFrame.pow(other, axis='columns', level=None, fill_value=None) | |||
| | | | ||
|- | |- | ||
| Series. | | dot() | ||
| | | Compute the matrix multiplication between the DataFrame and other. | ||
| Series.dot(other) | |||
| DataFrame.dot(other) | |||
| | | | ||
|- | |- | ||
| Series. | | radd() | ||
| | | Get Addition of dataframe and other, element-wise (binary operator radd). | ||
| Series.radd(other, level=None, fill_value=None, axis=0) | |||
| DataFrame.radd(other, axis='columns', level=None, fill_value=None) | |||
| | | | ||
|- | |- | ||
| Series. | | rsub() | ||
| | | Get Subtraction of dataframe and other, element-wise (binary operator rsub). | ||
| Series.rsub(other, level=None, fill_value=None, axis=0) | |||
| DataFrame.rsub(other, axis='columns', level=None, fill_value=None) | |||
| | | | ||
|- | |- | ||
| Series. | | rmul() | ||
| | | Get Multiplication of dataframe and other, element-wise (binary operator rmul). | ||
| Series.rmul(other, level=None, fill_value=None, axis=0) | |||
| DataFrame.rmul(other, axis='columns', level=None, fill_value=None) | |||
| | | | ||
|- | |- | ||
| Series. | | rdiv() | ||
| | | Get Floating division of dataframe and other, element-wise (binary operator rtruediv). | ||
| Series.rdiv(other, level=None, fill_value=None, axis=0) | |||
| DataFrame.rdiv(other, axis='columns', level=None, fill_value=None) | |||
| | | | ||
|- | |- | ||
| Series. | | rtruediv() | ||
| | | Get Floating division of dataframe and other, element-wise (binary operator rtruediv). | ||
| Series.rtruediv(other, level=None, fill_value=None, axis=0) | |||
| DataFrame.rtruediv(other, axis='columns', level=None, fill_value=None) | |||
| | | | ||
|- | |- | ||
| Series. | | rfloordiv() | ||
| | | Get Integer division of dataframe and other, element-wise (binary operator rfloordiv). | ||
| Series.rfloordiv(other, level=None, fill_value=None, axis=0) | |||
| DataFrame.rfloordiv(other, axis='columns', level=None, fill_value=None) | |||
| | | | ||
|- | |- | ||
| Series. | | rmod() | ||
| | | Get Modulo of dataframe and other, element-wise (binary operator rmod). | ||
| Series.rmod(other, level=None, fill_value=None, axis=0) | |||
| DataFrame.rmod(other, axis='columns', level=None, fill_value=None) | |||
| | | | ||
|- | |- | ||
| Series. | | rpow() | ||
| | | Get Exponential power of dataframe and other, element-wise (binary operator rpow). | ||
| Series.rpow(other, level=None, fill_value=None, axis=0) | |||
| DataFrame.rpow(other, axis='columns', level=None, fill_value=None) | |||
| | | | ||
|- | |- | ||
| Series. | | lt() | ||
| | | Get Less than of dataframe and other, element-wise (binary operator lt). | ||
| Series.lt(other, level=None, fill_value=None, axis=0) | |||
| DataFrame.lt(other, axis='columns', level=None) | |||
| | | | ||
|- | |- | ||
| Series. | | gt() | ||
| | | Get Greater than of dataframe and other, element-wise (binary operator gt). | ||
| Series.gt(other, level=None, fill_value=None, axis=0) | |||
| DataFrame.gt(other, axis='columns', level=None) | |||
| | | | ||
|- | |- | ||
| Series. | | le() | ||
| | | Get Less than or equal to of dataframe and other, element-wise (binary operator le). | ||
| Series.le(other, level=None, fill_value=None, axis=0) | |||
| DataFrame.le(other, axis='columns', level=None) | |||
| | | | ||
|- | |- | ||
| Series. | | ge() | ||
| | | Get Greater than or equal to of dataframe and other, element-wise (binary operator ge). | ||
| Series.ge(other, level=None, fill_value=None, axis=0) | |||
| DataFrame.ge(other, axis='columns', level=None) | |||
| | | | ||
|- | |- | ||
| Series. | | ne() | ||
| | | Get Not equal to of dataframe and other, element-wise (binary operator ne). | ||
| Series.ne(other, level=None, fill_value=None, axis=0) | |||
| DataFrame.ne(other, axis='columns', level=None) | |||
| | | | ||
|- | |- | ||
| Series. | | eq() | ||
| | | Get Equal to of dataframe and other, element-wise (binary operator eq). | ||
| Series.eq(other, level=None, fill_value=None, axis=0) | |||
| DataFrame.eq(other, axis='columns', level=None) | |||
| | | | ||
|- | |- | ||
| Series. | | combine() | ||
| | | Perform column-wise combine with another DataFrame. | ||
| Series.combine(other, func, fill_value=None) | |||
| DataFrame.combine(other, func, fill_value=None, overwrite=True) | |||
| | | | ||
|- | |- | ||
| | | combine_first() | ||
| | | Update null elements with value in the same location in other. | ||
| Series.combine_first(other) | |||
| DataFrame.combine_first(other) | |||
| | | | ||
|} | |||
==GroupBy分组== | |||
===创建GroupBy对象=== | |||
{| class="wikitable" style="width: 100%;" | |||
|- | |- | ||
! 类名 | |||
! 创建对象方法 | |||
! 完整参数 | |||
! 示例 | |||
|- | |- | ||
| Series. | | SeriesGroupBy | ||
| | | [https://pandas.pydata.org/docs/reference/api/pandas.Series.groupby.html#pandas.Series.groupby Series.groupby()] | ||
| | | Series.groupby(by=None, axis=0, level=None, as_index=True, sort=True, group_keys=True, squeeze=<object object>, observed=False, dropna=True) | ||
| | |||
|- | |||
| DataFrameGroupBy | |||
| [https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.groupby.html#pandas.DataFrame.groupby DataFrame.groupby()] | |||
| DataFrame.groupby(by=None, axis=0, level=None, as_index=True, sort=True, group_keys=True, squeeze=<object object>, observed=False, dropna=True) | |||
| <code>df.groupby('code')</code>或<code>df.groupby(by='code')</code>按code列分组,创建一个GroupBy对象 | |||
|- | |||
|} | |||
===GroupBy属性与方法=== | |||
====选取与迭代==== | |||
{| class="wikitable" style="width: 100%;" | |||
|- | |- | ||
!属性/方法 | |||
!描述 | |||
!示例 | |||
|- | |- | ||
| | | GroupBy.__iter__() | ||
| | | Groupby迭代器 | ||
| | | | ||
|- | |- | ||
| | | GroupBy.groups | ||
| | | Dict{组名->组标签} | ||
| | | for name, group in grouped:<br \> print(name)<br \> print(group ) | ||
|- | |- | ||
| | | GroupBy.indices | ||
| | | Dict{组名->组索引} | ||
| | | | ||
|- | |- | ||
| | | GroupBy.get_group(name, obj=None) | ||
| | | 通过组名选取一个组,返回DataFrame格式。 | ||
| | | grouped.get_group('AAPL') | ||
|- | |- | ||
| | | pandas.Grouper(*args, **kwargs) | ||
| | | x.describe() | ||
| | | | ||
|- | |- | ||
| | |} | ||
====功能应用==== | |||
| | {| class="wikitable" | ||
|- | |- | ||
!属性/方法 | |||
!描述 | |||
!Series | |||
!DataFrame | |||
!示例 | |||
|- | |- | ||
| | |GroupBy.apply() | ||
| | |应用,按组应用函数func,并将结果组合在一起。 | ||
| | |GroupBy.apply(func,* args,** kwargs) | ||
|GroupBy.apply(func,* args,** kwargs) | |||
|grouped['C'].apply(lambda x: x.describe()) | |||
|- | |- | ||
| | |GroupBy.agg() | ||
| | |聚合,等效aggregate() | ||
|GroupBy.agg(func,* args,** kwargs) | |||
|GroupBy.agg(func,* args,** kwargs) | |||
| | | | ||
|- | |- | ||
| | |aggregate() | ||
| | |聚合,在指定轴上使用一项或多项操作进行汇总。 | ||
|SeriesGroupBy.aggregate(func=None, *args, engine=None, engine_kwargs=None, **kwargs) | |||
|DataFrameGroupBy.aggregate(func=None, *args, engine=None, engine_kwargs=None, **kwargs) | |||
| | | | ||
|- | |- | ||
| | |transform() | ||
| | |转换,按组调用函数,并将原始数据替换为转换后的结果 | ||
|[https://pandas.pydata.org/docs/reference/api/pandas.core.groupby.SeriesGroupBy.transform.html#pandas.core.groupby.SeriesGroupBy.transform SeriesGroupBy.transform](func, *args, engine=None, engine_kwargs=None, **kwargs) | |||
|[https://pandas.pydata.org/docs/reference/api/pandas.core.groupby.DataFrameGroupBy.transform.html#pandas.core.groupby.DataFrameGroupBy.transform DataFrameGroupBy.transform](func, *args, engine=None, engine_kwargs=None, **kwargs) | |||
| | | | ||
|- | |- | ||
| | |GroupBy.pipe() | ||
| | |将带有参数的函数func应用于GroupBy对象,并返回函数的结果。 | ||
|GroupBy.pipe(func,* args,** kwargs) | |||
|GroupBy.pipe(func,* args,** kwargs) | |||
| | | | ||
|- | |- | ||
| | |} | ||
====计算/描述统计==== | |||
| | {| class="wikitable sortable" | ||
|- | |- | ||
!属性/方法 | |||
!描述 | |||
!Series | |||
!DataFrame | |||
!示例 | |||
|- | |- | ||
| | | GroupBy.all() | ||
| | | Return True if all values in the group are truthful, else False. | ||
| GroupBy.all(skipna=True) | |||
| DataFrameGroupBy.all(skipna=True) | |||
| | | | ||
|- | |- | ||
| | | GroupBy.any() | ||
| | | Return True if any value in the group is truthful, else False. | ||
| GroupBy.any(skipna=True) | |||
| DataFrameGroupBy.any(skipna=True) | |||
| | | | ||
|- | |- | ||
| | | GroupBy.backfill() | ||
| | | Backward fill the values. | ||
| | | GroupBy.backfill(limit=None) | ||
| DataFrameGroupBy.backfill(limit=None) | |||
| | |||
|- | |- | ||
| | | GroupBy.bfill() | ||
| | | 同 GroupBy.backfill() | ||
| GroupBy.bfill(limit=None) | |||
| DataFrameGroupBy.bfill(limit=None) | |||
| | | | ||
|- | |- | ||
| | | GroupBy.count() | ||
| | | 统计每组值的个数,不包含缺失值。 | ||
| | | GroupBy.count() | ||
| | | DataFrameGroupBy.count() | ||
| grouped.count() | |||
|- | |- | ||
| | | GroupBy.cumcount() | ||
| | | Number each item in each group from 0 to the length of that group - 1. | ||
| | | GroupBy.cumcount(ascending=True) | ||
| | | DataFrameGroupBy.cumcount(ascending=True) | ||
| | |||
|- | |- | ||
| | | GroupBy.cummax() | ||
| | | Cumulative max for each group. | ||
| | | GroupBy.cummax(axis=0, **kwargs) | ||
| | | DataFrameGroupBy.cummax(axis=0, **kwargs) | ||
| | |||
|- | |- | ||
| | | GroupBy.cummin() | ||
| | | Cumulative min for each group. | ||
| | | GroupBy.cummin(axis=0, **kwargs) | ||
| DataFrameGroupBy.cummin(axis=0, **kwargs) | |||
| | |||
|- | |||
| GroupBy.cumprod() | |||
| Cumulative product for each group. | |||
| GroupBy.cumprod(axis=0, *args, **kwargs) | |||
| DataFrameGroupBy.cumprod(axis=0, *args, **kwargs) | |||
| | | | ||
|- | |- | ||
| | | GroupBy.cumsum() | ||
| | | Cumulative sum for each group. | ||
| | | GroupBy.cumsum(axis=0, *args, **kwargs) | ||
| DataFrameGroupBy.cumsum(axis=0, *args, **kwargs) | |||
| | | | ||
|- | |- | ||
| | | GroupBy.ffill() | ||
| Forward fill the values. | |||
| GroupBy.ffill(limit=None) | |||
| DataFrameGroupBy.ffill(limit=None) | |||
| | | | ||
|- | |||
| GroupBy.first() | |||
| Compute first of group values. | |||
| colspan="2" |GroupBy.first(numeric_only=False, min_count=- 1) | |||
| | | | ||
|- | |- | ||
| | | GroupBy.head() | ||
| | | 返回每组的前n行,默认5行 | ||
| | | colspan="2" | GroupBy.head(n=5) | ||
| | |||
|- | |||
| GroupBy.last() | |||
| Compute last of group values. | |||
| colspan="2" | GroupBy.last(numeric_only=False, min_count=- 1) | |||
| | | | ||
|- | |- | ||
| | | GroupBy.max() | ||
| | | Compute max of group values. | ||
| | | colspan="2" | GroupBy.max(numeric_only=False, min_count=- 1) | ||
| | | | ||
|- | |- | ||
| | | GroupBy.mean() | ||
| | | Compute mean of groups, excluding missing values. | ||
| | | colspan="2" | GroupBy.mean(numeric_only=True) | ||
| | | | ||
|- | |||
| GroupBy.median() | |||
| Compute median of groups, excluding missing values. | |||
| colspan="2" | GroupBy.median(numeric_only=True) | |||
| | | | ||
|- | |- | ||
| GroupBy.min([numeric_only, min_count]) | |||
| Compute min of group values. | |||
| colspan="2" | GroupBy.min(numeric_only=False, min_count=- 1) | |||
| | |||
|- | |- | ||
| | | GroupBy.ngroup([ascending]) | ||
| | | Number each group from 0 to the number of groups - 1. | ||
| | | colspan="2" | GroupBy.ngroup(ascending=True) | ||
| | | | ||
|- | |- | ||
| | | GroupBy.nth(n[, dropna]) | ||
| 如果参数n是一个整数,则取每个组的第n行;如果n是一个整数列表,则取每组行的子集。 | |||
| | | colspan="2" | GroupBy.nth(n, dropna=None) | ||
| | | | ||
|- | |- | ||
| | | GroupBy.ohlc() | ||
| 计算组的开始值,最高值,最低值和末尾值,不包括缺失值。 | |||
| colspan="2" | GroupBy.ohlc() | |||
| | |||
|- | |- | ||
| GroupBy.pad() | |||
| Forward fill the values. | |||
| GroupBy.pad(limit=None) | |||
|DataFrameGroupBy.pad(limit=None) | |||
| | |||
|- | |- | ||
| GroupBy. | | GroupBy.prod([numeric_only, min_count]) | ||
| | | Compute prod of group values. | ||
| colspan="2" | GroupBy.prod(numeric_only=True, min_count=0) | |||
| | | | ||
|- | |- | ||
| GroupBy. | | GroupBy.rank([method, ascending, na_option, …]) | ||
| | | Provide the rank of values within each group. | ||
| | | GroupBy.rank(method='average', ascending=True, na_option='keep', pct=False, axis=0) | ||
| DataFrameGroupBy.rank(method='average', ascending=True, na_option='keep', pct=False, axis=0) | |||
| | |||
|- | |- | ||
| GroupBy. | | GroupBy.pct_change([periods, fill_method, …]) | ||
| | | Calculate pct_change of each value to previous entry in group. | ||
| GroupBy.pct_change(periods=1, fill_method='pad', limit=None, freq=None, axis=0) | |||
| DataFrameGroupBy.pct_change(periods=1, fill_method='pad', limit=None, freq=None, axis=0) | |||
| | | | ||
|- | |- | ||
| GroupBy. | | GroupBy.size() | ||
| | | Compute group sizes. | ||
| | | GroupBy.size() | ||
| DataFrameGroupBy.size() | |||
| | |||
|- | |- | ||
| | | GroupBy.sem() | ||
| | | Compute standard error of the mean of groups, excluding missing values. | ||
| colspan="2" | GroupBy.sem(ddof=1) | |||
| | | | ||
|- | |- | ||
| | | GroupBy.std() | ||
| Compute standard deviation of groups, excluding missing values. | |||
| colspan="2" | GroupBy.std(ddof=1) | |||
| | |||
|- | |- | ||
| GroupBy.sum([numeric_only, min_count]) | |||
| Compute sum of group values. | |||
| colspan="2" | GroupBy.sum(numeric_only=True, min_count=0) | |||
| | |||
|- | |- | ||
|GroupBy. | | GroupBy.var([ddof]) | ||
| Compute variance of groups, excluding missing values. | |||
| colspan="2" | GroupBy.var(ddof=1) | |||
| | |||
| | |||
|GroupBy. | |||
| | | | ||
|- | |- | ||
| | | GroupBy.tail() | ||
| | | 返回每组的最后n行,默认5行 | ||
| | | colspan="2" | GroupBy.tail(n=5) | ||
| | |||
| | | | ||
|} | |||
{{了解更多 | |||
|[https://pandas.pydata.org/docs/user_guide/groupby.html Pandas 用户指南:Group by: split-apply-combine] | |||
|[https://pandas.pydata.org/docs/reference/groupby.html Pandas 参考:GroupBy] | |||
}} | |||
==时间序列== | |||
===概览=== | |||
Pandas把时间相关分为4种概念,用8个类来表示。 | |||
{| class="wikitable" | |||
|- | |- | ||
! 概念 | |||
! 描述 | |||
! 标量类 | |||
! 数组类 | |||
! pandas数据类型 | |||
! 主要创建方法 | |||
! 示例 | |||
|- | |- | ||
| | | 日期时间 | ||
| | | 支持时区的特定日期时间点。<br \>类似Python标准库的datetime.datetime。 | ||
| | | Timestamp | ||
| | | DatetimeIndex | ||
| datetime64[ns] <br \>或 datetime64[ns, tz] | |||
| to_datetime() <br \>date_range() | |||
| <code>pd.to_datetime('2020-01-01')</code>生成:Timestamp('2020-01-01 00:00:00') | |||
|- | |||
| 时间增量 | |||
| 持续时间,即两个日期或时间的差值。<br \>类似Python标准库的datetime.timedelta。 | |||
| Timedelta | |||
| TimedeltaIndex | |||
| timedelta64[ns] | |||
| to_timedelta() <br \>timedelta_range() | |||
| | | | ||
|- | |- | ||
| | | 时间跨度 | ||
| 由时间点及其关联的频率定义的时间跨度。 | |||
| Period | |||
| PeriodIndex | |||
| period[freq] | |||
| Period() <br \>period_range() | |||
| | |||
|- | |- | ||
| 日期偏移 | |||
| 日期增量 | |||
| DateOffset | |||
| None | |||
| None | |||
| | | DateOffset() | ||
| | |||
| | |||
| | |||
| | |||
| | | | ||
|} | |||
{{了解更多 | |||
|[https://pandas.pydata.org/docs/user_guide/timeseries.html pandas 用户指南:时间序列] | |||
}} | |||
===日期时间属性=== | |||
以下是Timestamp类和DatetimeIndex类的一些属性或方法。 | |||
{| class="wikitable" | |||
|- | |- | ||
! 属性 | |||
! 描述 | |||
! 示例 | |||
|- | |- | ||
| | | year | ||
| | | 年 | ||
| | | | ||
|- | |- | ||
| | | month | ||
| | | 月 | ||
| | |||
| | |||
|- | |- | ||
| | | day | ||
| | | 日 | ||
| | | | ||
|- | |- | ||
| | | hour | ||
| | | 小时 | ||
| | | | ||
| | |- | ||
| minute | |||
| 分钟 | |||
| | | | ||
|- | |- | ||
| | | second | ||
| | | 秒 | ||
| | |||
| | |||
|- | |- | ||
| | | microsecond | ||
| | | 微秒 | ||
| | |||
| | |||
|- | |- | ||
| | | nanosecond | ||
| | | 纳秒 | ||
| | |||
| | |||
|- | |- | ||
| | | date | ||
| | | 日期(不包含时区信息) | ||
| | | | ||
|- | |- | ||
| | | time | ||
| | | 时间(不包含时区信息) | ||
| | | | ||
|- | |- | ||
| | | timetz() | ||
| | | 时间(包含本地时区信息) | ||
| | |||
| | |||
|- | |- | ||
| | | day_of_year / dayofyear | ||
| | | 一年里的第几天 | ||
| | |||
| | |||
|- | |- | ||
| | | week / weekofyear | ||
| | | 一年里的第几周 | ||
| | |||
| | |||
|- | |- | ||
| | | day_of_week / dayofweek / weekday | ||
| | | 一周里的第几天,Monday(星期一)=0,Sunday(星期天)=6 | ||
| | |||
| | |||
|- | |- | ||
| | | quarter | ||
| | | 日期所处的季度,如(1月、2月、3月)=1,(4月、5月、6月)=2 | ||
| | |||
| | |||
|- | |- | ||
| | | days_in_month | ||
| | | 日期所在的月有多少天 | ||
| | |||
| | |||
|- | |- | ||
| | | is_month_start | ||
| | | 是否月初(由频率定义) | ||
| | |||
| | |||
|- | |- | ||
| | | is_month_end | ||
| | | 是否月末(由频率定义) | ||
| | | | ||
|- | |- | ||
| | | is_quarter_start | ||
| | | 是否季初(由频率定义) | ||
| | |||
| | |||
|- | |- | ||
| | | is_quarter_end | ||
| | | 是否季末(由频率定义) | ||
| | |||
| | |||
|- | |- | ||
| | | is_year_start | ||
| | | 是否年初(由频率定义) | ||
| | |||
| | |||
|- | |- | ||
| | | is_year_end | ||
| | | 是否年末(由频率定义) | ||
| | |||
| | |||
|- | |- | ||
| | | is_leap_year | ||
| | | 是否闰年 | ||
| | | | ||
|} | |||
{{了解更多 | |||
|[https://pandas.pydata.org/docs/user_guide/timeseries.html#time-date-components pandas 用户指南:时间序列 Time/date components] | |||
}} | |||
===日期偏移=== | |||
DateOffset对象用来处理日期偏移。 | |||
{| class="wikitable" | |||
|- | |- | ||
! 日期偏移量 | |||
! 频率字符串 | |||
! 描述 | |||
! 示例 | |||
|- | |- | ||
| | | DateOffset | ||
| | | 无 | ||
| | | 通用偏移类,默认为24小时 | ||
| | | | ||
|- | |- | ||
| | | Day | ||
| | | 'D' | ||
| | | 一天 | ||
| | | | ||
|- | |- | ||
| | | Hour | ||
| | | 'H' | ||
| | | 一小时 | ||
| | | | ||
|- | |||
| Minute | |||
| 'T' 或 'min' | |||
| 一分钟 | |||
| | |||
|- | |- | ||
| | | Second | ||
| | | 'S' | ||
| | | 一秒 | ||
| | | | ||
|- | |- | ||
| | | Milli | ||
| | | 'L' 或 'ms' | ||
| | | 一毫秒 | ||
| | |||
|- | |- | ||
| Micro | |||
| 'U' 或 'us' | |||
| 一微秒 | |||
| | |||
|- | |- | ||
| | | Nano | ||
| | | 'N' | ||
| 一纳秒 | |||
| | |||
| | |||
| | |||
|- | |- | ||
| | | BDay 或 BusinessDay | ||
| | | 'B' | ||
| | | 工作日 | ||
| | |||
| | |||
|- | |- | ||
| | | CDay 或 CustomBusinessDay | ||
| | | 'C' | ||
| | | 自定义工作日 | ||
| | |||
| | |||
|- | |- | ||
| | | Week | ||
| | | 'W' | ||
| | | 一周,可选锚定周几 | ||
| | | | ||
|- | |- | ||
| WeekOfMonth | |||
| 'WOM' | |||
| 每月第几周的第几天 | |||
| | |||
| | |||
| | |||
| | | | ||
|- | |- | ||
| | | LastWeekOfMonth | ||
| | | 'LWOM' | ||
| 每月最后一周的第几天 | |||
| | | | ||
|- | |- | ||
| | | MonthEnd | ||
| | | 'M' | ||
| 日历月末 | |||
| | | | ||
|- | |- | ||
| | | MonthBegin | ||
| | | 'MS' | ||
| 日历月初 | |||
| | | | ||
|- | |- | ||
| | | BMonthEnd 或 BusinessMonthEnd | ||
| | | 'BM' | ||
| 工作日月末 | |||
| | | | ||
|- | |- | ||
| | | BMonthBegin 或 BusinessMonthBegin | ||
| | | 'BMS' | ||
| 工作日月初 | |||
| | | | ||
|- | |- | ||
| | | CBMonthEnd 或 CustomBusinessMonthEnd | ||
| | | 'CBM' | ||
| 自定义工作日月末 | |||
| | | | ||
|- | |- | ||
| | | CBMonthBegin 或 CustomBusinessMonthBegin | ||
| | | 'CBMS' | ||
| 自定义工作日月初 | |||
| | | | ||
|- | |- | ||
| | | SemiMonthEnd | ||
| | | 'SM' | ||
| 月第15天(或其他天数)与日历月末 | |||
| | | | ||
|- | |- | ||
| | | SemiMonthBegin | ||
| | | 'SMS' | ||
| 日历月初与月第15天(或其他天数) | |||
| | | | ||
|- | |- | ||
| | | QuarterEnd | ||
| | | 'Q' | ||
| 日历季末 | |||
| | | | ||
|- | |- | ||
| | | QuarterBegin | ||
| | | 'QS' | ||
| 日历季初 | |||
| | | | ||
|- | |- | ||
| | | BQuarterEnd | ||
| | | 'BQ' | ||
| 工作日季末 | |||
| | | | ||
|- | |- | ||
| | | BQuarterBegin | ||
| | | 'BQS' | ||
| 工作日季初 | |||
| | | | ||
|- | |- | ||
| | | FY5253Quarter | ||
| | | 'REQ' | ||
| 零售(又名 52-53 周)季 | |||
| | | | ||
|- | |- | ||
| | | YearEnd | ||
| | | 'A' | ||
| 日历年末 | |||
| | | | ||
|- | |- | ||
| | | YearBegin | ||
| | | 'AS' 或 'BYS' | ||
| 日历年初 | |||
| | | | ||
|- | |- | ||
| | | BYearEnd | ||
| | | 'BA' | ||
| 工作日年末 | |||
| | | | ||
|- | |- | ||
| | | BYearBegin | ||
| | | 'BAS' | ||
| 工作日年初 | |||
| | | | ||
|- | |- | ||
| | | FY5253 | ||
| | | 'RE' | ||
| 零售(又名 52-53 周)年 | |||
| | | | ||
|- | |- | ||
| | | Easter | ||
| | | 无 | ||
| 复活节假日 | |||
| | | | ||
|- | |- | ||
| | | BusinessHour | ||
| | | 'BH' | ||
| 工作小时 | |||
| | | | ||
|- | |- | ||
| | | CustomBusinessHour | ||
| | | 'CBH' | ||
| 自定义工作小时 | |||
| | | | ||
|} | |} | ||
=== | ===时间序列相关=== | ||
{| class="wikitable" | |||
{| class="wikitable" | |||
|- | |- | ||
! | !属性/方法 | ||
! | !描述 | ||
! | !Series | ||
! 示例 | !DataFrame | ||
!示例 | |||
|- | |- | ||
| | | asfreq() | ||
| | | Convert TimeSeries to specified frequency. | ||
| | | Series.asfreq(freq, method=None, how=None, normalize=False, fill_value=None) | ||
| | | DataFrame.asfreq(freq, method=None, how=None, normalize=False, fill_value=None) | ||
| | |||
|- | |- | ||
| | | asof() | ||
| | | Return the last row(s) without any NaNs before where. | ||
| | | Series.asof(where, subset=None) | ||
| | | DataFrame.asof(where, subset=None) | ||
| | |||
|- | |- | ||
| | | shift() | ||
| | | Shift index by desired number of periods with an optional time freq. | ||
| | | Series.shift(periods=1, freq=None, axis=0, fill_value=None) | ||
| | | DataFrame.shift(periods=1, freq=None, axis=0, fill_value=None) | ||
| | |||
|- | |- | ||
| | | slice_shift() | ||
| | | Equivalent to shift without copying data. | ||
| | | Series.slice_shift(periods=1, axis=0) | ||
| | | DataFrame.slice_shift(periods=1, axis=0) | ||
| | |||
|- | |- | ||
| | | tshift() | ||
| | | (DEPRECATED) Shift the time index, using the index’s frequency if available. | ||
| | | Series.tshift(periods=1, freq=None, axis=0) | ||
| | | DataFrame.tshift(periods=1, freq=None, axis=0) | ||
| | |||
|- | |- | ||
| | | first_valid_index() | ||
| | | Return index for first non-NA/null value. | ||
| | | Series.first_valid_index() | ||
| | | DataFrame.first_valid_index() | ||
| | |||
|- | |- | ||
| | | last_valid_index() | ||
| | | Return index for last non-NA/null value. | ||
| | | Series.last_valid_index() | ||
| | | DataFrame.last_valid_index() | ||
| | |||
|- | |- | ||
| | | resample() | ||
| ' | | Resample time-series data. | ||
| | | Series.resample(rule, axis=0, closed=None, label=None, convention='start', kind=None, loffset=None, base=None, on=None, level=None, origin='start_day', offset=None) | ||
| | | DataFrame.resample(rule, axis=0, closed=None, label=None, convention='start', kind=None, loffset=None, base=None, on=None, level=None, origin='start_day', offset=None) | ||
| | |||
|- | |- | ||
| | | to_period() | ||
| | | Convert DataFrame from DatetimeIndex to PeriodIndex. | ||
| | | Series.to_period(freq=None, copy=True) | ||
| | | DataFrame.to_period(freq=None, axis=0, copy=True) | ||
| | |||
|- | |- | ||
| | | to_timestamp() | ||
| ' | | Cast to DatetimeIndex of timestamps, at beginning of period. | ||
| | | Series.to_timestamp(freq=None, how='start', copy=True) | ||
| | | DataFrame.to_timestamp(freq=None, how='start', axis=0, copy=True) | ||
| | |||
|- | |- | ||
| | | tz_convert() | ||
| | | Convert tz-aware axis to target time zone. | ||
| | | Series.tz_convert(tz, axis=0, level=None, copy=True) | ||
| | | DataFrame.tz_convert(tz, axis=0, level=None, copy=True) | ||
| | |||
|- | |- | ||
| | | tz_localize() | ||
| Localize tz-naive index of a Series or DataFrame to target time zone. | |||
| Series.tz_localize(tz, axis=0, level=None, copy=True, ambiguous='raise', nonexistent='raise') | |||
| DataFrame.tz_localize(tz, axis=0, level=None, copy=True, ambiguous='raise', nonexistent='raise') | |||
| | |||
|- | |||
| | |||
| | |||
| | |||
|} | |} | ||
2021年7月21日 (三) 15:37的版本
Pandas是一个Python语言的开源软件库,用于数据分析,可以方便对数据进行处理、计算、分析、存储及可视化。
简介
时间轴
- 2008年,开发者Wes McKinney在AQR Capital Management开始制作pandas来满足在财务数据上进行定量分析对高性能、灵活工具的需要。在离开AQR之前他说服管理者允许他将这个库开放源代码。
- 2012年,另一个AQR雇员Chang She加入了这项努力并成为这个库的第二个主要贡献者。
- 2015年,Pandas签约成为NumFOCUS的财务赞助项目;NumFOCUS是美国的501(c)(3)非营利慈善团体。
安装和导入
使用pip安装Pandas
pip install pandas
如果使用的是Anaconda等计算科学软件包,已经安装好了pandas库。
导入Pandas,在脚本顶部导入,一般写法如下:
import pandas as pd
查看Pandas版本:
pd.__version__
数据结构
pandas定义了2种数据类型,Series和DataFrame,大部分操作都在这两种数据类型上进行。
了解更多 >> Pandas 用户指南:数据结构
Series
Series是一个有轴标签(索引)的一维数组,能够保存任何数据类型(整数,字符串,浮点数,Python对象等)。轴标签称为index
。和Python字典类似。
创建Series的基本方法为,使用pandas.Series类新建一个Series对象,格式如下:
pandas.Series(data=None, index=None, dtype=None, name=None, copy=False, fastpath=False)
轴标签index不是必须,如果省略,轴标签默认为从0开始的整数数组。一些示例如下:
s = pd.Series(["foo", "bar", "foba"])
print(type(s)) #<class 'pandas.core.series.Series'>
s2 = pd.Series(["foo", "bar", "foba"], index=['b','d','c'])
# 创建日期索引
date_index = pd.date_range("2020-01-01", periods=3, freq="D")
s3 = pd.Series(["foo", "bar", "foba"], index=date_index)
了解更多 >> Pandas 用户指南:Series Pandas API:Series
DataFrame
DataFrame是有标记的二维的数据结构,具有可能不同类型的列。由数据,行标签(索引,index),列标签(列,columns)构成。类似电子表格或SQL表或Series对象的字典。它通常是最常用的Pandas对象。
创建DataFrame对象有多种方法:
- 使用
pandas.DataFrame()
构造方法 - 使用
pandas.DataFrame.from_dict()
方法,类似构造方法 - 使用
pandas.DataFrame.from_records()
方法,类似构造方法 - 使用函数从导入文件创建,如使用
pandas.read_csv()
函数导入csv文件创建一个DataFrame对象。
构造方法pandas.DataFrame()
的格式为:
pandas.DataFrame(data=None, index=None, columns=None, dtype=None, copy=False)
示例:
df = pd.DataFrame([['foo', 22], ['bar', 25], ['test', 18]],columns=['name', 'age'])
查看数据
表示例中s为一个Series对象,df为一个DataFrame对象:
>>> s = pd.Series(['a', 'b', 'c'])
>>> s
0 a
1 b
2 c
dtype: object
>>> df = pd.DataFrame([['foo', 22], ['bar', 25], ['test', 18]],columns=['name', 'age'])
>>> df
属性/方法 | 描述 | Series | DataFrame | 示例 |
---|---|---|---|---|
head() | 返回前n行数据,默认前5行 | Series.head(n=5) | DataFrame.head(n=5) | df.head() 返回df前5行数据df.head(10) 返回df前10行数据。
|
tail() | 返回最后n行数据,默认最后5行 | Series.tail(n=5) | DataFrame.tail(n=5) | df.tail() 返回df最后5行数据df.tail(10) 返回df最后10行数据。
|
dtypes | 返回数据的Numpy数据类型(dtype对象) | Series.dtypes | DataFrame.dtypes | s.dtypes df.dtypes
|
dtype | 返回数据的Numpy数据类型(dtype对象) | Series.dtype | − | s.dtype
|
array | 返回 Series 或 Index 数据的数组,该数组为pandas扩展的python数组. | Series.array | − | s.array 返回:<PandasArray> ['a', 'b', 'c'] Length: 3, dtype: object |
attrs | 此对象全局属性字典。 | Series.attrs | DataFrame.attrs | s.attrs 返回{}
|
hasnans | 如果有任何空值(如Python的None,np.NaN)返回True,否则返回False。 | Series.hasnans | − | s.hasnans 返回False |
values | 返回ndarray(NumPy的多维数组)或类似ndarray的形式。 | Series.values | DataFrame.values | s.values 返回array(['a', 'b', 'c'], dtype=object)
|
ndim | 返回数据的维数,Series返回1,DataFrame返回2 | Series.ndim | DataFrame.ndim | s.ndim 返回1 df.ndim 返回2
|
size | 返回数据中元素的个数 | Series.size | DataFrame.size | s.size 返回3 df.size 返回6
|
shape | 返回数据形状(行数和列数)的元组 | Series.shape | DataFrame.shape | s.shape 返回(3, ) df.shape 返回(3, 2)
|
empty | 返回是否为空,为空返回True | Series.empty | DataFrame.empty | s.empty 返回False df.empty 返回False
|
name | 返回Series的名称。 | Series.name | − | s.name 返回空
|
memory_usage() | 返回Series或DataFrame的内存使用情况,单位Bytes。参数index默认为True,表示包含index。 参数deep默认为False,表示不通过查询dtypes对象来深入了解数据的系统级内存使用情况 |
Series.memory_usage(index=True, deep=False) | DataFrame.memory_usage(index=True, deep=False) | s.memory_usage() 返回152 df.memory_usage(index=False)
|
info() | 打印DataFrame的简要信息。 | − | DataFrame.info(verbose=True, buf=None, max_cols=None, memory_usage=True, null_counts=True) | df.info()
|
select_dtypes() | 根据列的dtypes返回符合条件的DataFrame子集 | − | DataFrame.select_dtypes(include=None, exclude=None) | df.select_dtypes(include=['float64'])
|
索引
查看索引
属性/方法 | 描述 | Series | DataFrame | 示例 |
---|---|---|---|---|
index | 索引(行标签),可以查看和设置 | Series.index | DataFrame.index | s.index 返回RangeIndex(start=0, stop=3, step=1) df.index
|
columns | 列标签,Series无,可以查看和设置 | − | DataFrame.columns | df.columns
|
keys() | 列标签,没有就返回索引 | Series.keys() | DataFrame.keys() | df.keys() 返回列标签
|
axes | 返回轴标签(行标签和列标签)的列表。 Series返回[index] DataFrame返回[index, columns] |
Series.axes | DataFrame.axes | s.axes 返回[RangeIndex(start=0, stop=3, step=1)] df.axes 返回索引和列名。
|
idxmax() | 返回第一次出现最大值的索引位置。 | Series.idxmax(axis=0, skipna=True, *args, **kwargs) | DataFrame.idxmax(axis=0, skipna=True) | df.idxmax()
|
idxmin() | 返回第一次出现最小值的索引位置。 | Series.idxmin(axis=0, skipna=True, *args, **kwargs) | DataFrame.idxmin(axis=0, skipna=True) | s.idxmin()
|
设置与重置索引
Series对象和DataFrame对象可以通过.index
或.columns
属性设置,还可以通过以下方法来设置与重置。
属性/方法 | 描述 | Series | DataFrame | 示例 |
---|---|---|---|---|
set_index() | 将某列设置为索引 | − | DataFrame.set_index(keys, drop=True, append=False, inplace=False, verify_integrity=False) | df.set_index('col_3') 将‘col_3’列设置为索引。
|
reset_index() | 重置索引,默认从0开始整数。参数:drop 是否删除原索引,默认不删除 level 重置多索引的一个或多个级别。
|
Series.reset_index(level=None, drop=False, name=None, inplace=False) | DataFrame.reset_index(level=None, drop=False, inplace=False, col_level=0, col_fill='') | |
reindex() | 用Series或DataFrame匹配新索引。对于新索引有旧索引无的默认使用NaN填充,新索引无旧索引有的删除。 | Series.reindex(index=None, method=None, copy=True, level=None, fill_value=nan, limit=None, tolerance=None) | DataFrame.reindex(labels=None, index=None, columns=None, axis=None, method=None, copy=True, level=None, fill_value=nan, limit=None, tolerance=None) | |
reindex_like() | Return an object with matching indices as other object. | Series.reindex_like(other, method=None, copy=True, limit=None, tolerance=None) | DataFrame.reindex_like(other, method=None, copy=True, limit=None, tolerance=None) | |
rename() | 修改轴(索引或列)标签。 | Series.rename(index=None, *, axis=None, copy=True, inplace=False, level=None, errors='ignore') | DataFrame.rename(mapper=None, index=None, columns=None, axis=None, copy=True, inplace=False, level=None, errors='ignore') | |
rename_axis() | Set the name of the axis for the index or columns. | Series.rename_axis(**kwargs) | DataFrame.rename_axis(**kwargs) | |
set_axis() | Assign desired index to given axis. | Series.set_axis(labels, axis=0, inplace=False) | DataFrame.set_axis(labels, axis=0, inplace=False) | df.set_axis(['a', 'b', 'c'], axis='index') df.set_axis(['I', 'II'], axis='columns')
|
add_prefix() | 索引或列标签添加前缀 | Series.add_prefix(prefix) | DataFrame.add_prefix(prefix) | s.add_prefix('item_') df.add_prefix('col_')
|
add_suffix() | 索引或列标签添加后缀 | Series.add_suffix(suffix) | DataFrame.add_suffix(suffix) |
数据选取与迭代
概览
方法 | 描述 | 示例 |
---|---|---|
索引运算符 [ ]
|
Python中序列对象使用self[key] 是在调用对象的特殊方法__getitem__() 。Python运算符[ ] 有3种通用序列操作:self[i] 取第i项(起始为0)self[i:j] 从 i 到 j 的切片self[i:j:k] s 从 i 到 j 步长为 k 的切片 Pandas支持NumPy扩展的一些操作: self[布尔索引] ,如s[s>5]
|
s[1] 取s的第二个值df[1:-1] 切片,返回df第二行到倒数第二行组成的DataFrame对象
|
属性运算符.
|
同Python字典属性获取 | df.a 返回df的名称为a的列
|
按标签选择 loc[ ]
|
通过对象调用.loc 属性生成序列对象,序列对象调用索引运算符[] 。
|
df.loc[2] 选取索引(行标签)值为2的行 df.loc[1:2] 选取索引值为1到2的行 df.loc[[1,2]] 选取索引值为1和2的行 df.loc[1,'name'] 选取行标签值为1,列标签值为'name'的单个值df.loc[1:2,'name'] 选取行标签值为1到2,列标签值为'name'的数据
|
按位置选择 iloc[ ]
|
纯粹基于整数位置的索引方法,通过对象调用.iloc 属性生成序列对象,然后序列对象调用索引运算符[] 。
|
s.iloc[2] 选取行标签位置为2的行 s.iloc[:2] 选取索引为0到2(不包含2)的值 s.iloc[[True,False,True]] 选取索引位置为True的值 s.iloc[lambda x: x.index % 2 == 0] 选取索引为双数的值
|
按标签选择单个 at[ ]
|
通过行轴和列轴标签对获取或设置单个值。 | s.at[1] 返回'b's.at[2]='d' 设置索引位置为第三的值等于'd' df.at[2, 'name'] 获取index=2,columns='name'点的值
|
按位置选择单个 iat[ ]
|
通过行轴和列轴整数位置获取或设置单个值。 | s.iat[1] s.iat[2]='d'
|
查询方法 query()
|
DataFrame对象query()方法,使用表达式进行选择。DataFrame.query(expr, inplace=False, **kwargs)
|
df.query('A > B') 相当于df[df.A > df.B]
|
通过行列标签筛选 filter()
|
通过行列标签筛选 Series.filter(items=None, like=None, regex=None, axis=None) DataFrame.filter(items=None, like=None, regex=None, axis=None)
|
df.filter(like='bbi', axis=0) 选取行标签包含'bbi'的行。
|
多索引选择 xs()
|
只能用于选择数据,不能设置值。可以使用iloc[ ] 或loc[ ] 替换。Series.xs(key, axis=0, level=None, drop_level=True) DataFrame.xs(key, axis=0, level=None, drop_level=True)
|
df.xs('a', level=1) |
选择一列 get() |
选择某一列 Series.get(key, default=None) DataFrame.get(key, default=None)
|
df.get('a') 返回a列
|
选择指定标签列并删除 pop()
|
返回某一列,并从数据中删除,如果列名没找到抛出KeyError。Series.pop(item) DataFrame.pop(item)
|
df.pop('a') 返回a列并从df中删除。
|
删除指定标签列 drop()
|
返回删除指定标签列后的数据 Series.drop(labels=None, axis=0, index=None, columns=None, level=None, inplace=False, errors='raise') DataFrame.drop(labels=None, axis=0, index=None, columns=None, level=None, inplace=False, errors='raise')
|
|
抽样 sample()
|
返回抽样数据 Series.sample(n=None, frac=None, replace=False, weights=None, random_state=None, axis=None) DataFrame.sample(n=None, frac=None, replace=False, weights=None, random_state=None, axis=None)
|
了解更多 >> Pandas 指南:索引与选择数据 Python 3 文档:序列类型 - 通用序列操作 Python 3 文档:数据模型 - 特殊方法名称 NumPy 文档:初学者基础知识 - 索引和切片
按标签选择
pandas提供基于标签的索引方法,通过对象调用.loc
属性生成序列对象,序列对象调用索引运算符[]
。该方法严格要求,每个标签都必须在索引中,否则会抛出KeyError错误。切片时,如果索引中存在起始边界和终止边界,则都将包括在内。整数是有效的标签,但它们引用的是标签,而不是位置(索引顺序)。
.loc索引输入值 | 描述 | Series示例 | DataFrame示例 |
---|---|---|---|
单个标签 | 例如5或'a'(注意,5被解释为索引的标签,而不是整数位置。) | s.loc['a'] 返回s索引为'a'的值
|
df.loc['b'] 返回df索引(行标签)为'b'的行(Series对象)
|
标签列表或标签数组 | 如['a', 'c'](注意:这种方式会有两组方括号[[]] ,里面是生成列表,外面是索引取值操作)
|
s.loc[['a', 'c']] 返回s索引为'a'和'c'的值(Series对象)
|
df.loc[['a', 'c']] 返回df索引(行标签)为'a'和'c'的行(DataFrame对象)
|
带标签的切片对象 | 切片如 'a':'f'表示标签'a'到标签'f',步长切片如 'a':'f':2表示标签'a'到标签'f'按步长2选取(注意:和Python切片不同,这里包含开始标签和结束标签),还有一些常用示例如:'f': 从标签'f'开始到最后:'f' 从最开始到标签'f': 全部标签
|
s.loc['a':'c'] 返回s索引'a'到'c'的值
|
df.loc['b':'f'] 返回df索引(行标签)'b'到'f'的行(DataFrame对象)
|
行标签,列标签 | 只有DataFrame可用,格式行标签,列标签 ,行标签或列标签可以使用切片或数组等。
|
− | df.loc['a','name'] 选取索引为'a',列标签为'name'的单个值。df.loc['a':'c','name' ] 返回Series对象df.loc['a':'c','id':'name' ] 返回DataFrame对象
|
布尔数组 | 如[True, False, True]。注意布尔数组长度要与轴标签长度相同,否则会抛出IndexError错误。 | s.loc[[True, False, True]] 返回s的第1个和第3个值
|
df.loc[[False, True, True]] 返回df的第2行和第3行
|
callable function | 会返回上面的一种索引形式 |
了解更多 >> Pandas 指南:索引与选择数据 - 按标签选择 Pandas 参考:DataFrame对象 - DataFrame.loc Pandas 参考:Series对象 - Series.loc
按位置选择
pandas还提供纯粹基于整数位置的索引方法,通过对象调用.iloc
属性生成序列对象,然后序列对象调用索引运算符[]
。尝试使用非整数,即使有效标签也会引发IndexError。索引是从0开始的整数。切片时,包含起始索引,不包含结束索引。
.iloc索引输入值 | 描述 | Series示例 | DataFrame示例 |
---|---|---|---|
单个整数 | 例如3 | s.iloc[0] 返回s位置索引为0的值,即第一值
|
df.iloc[5] 返回df索引为5的行(Series对象),即df的第六行
|
整数列表或数组 | 如[0,5](注意:这种方式会有两组方括号[[]] ,里面是生成列表,外面是索引取值操作)
|
s.iloc[[0,5]] 返回s索引为0和5的值(Series对象)
|
df.iloc[[2,5]] 返回df索引为2和5的行(DataFrame对象)
|
整数切片对象 | 切片如 3:5表示索引3到索引5(不包含索引5),步长切片如 0:5:2表示索引0到索引5(不包含索引5)按步长2选取,还有一些常用示例如:2: 从索引2开始到最后:6 从最开始到索引6(不包含索引6): 全部索引
|
s.iloc[3:5] 返回s索引3到索引5(不包含索引5)的值
|
df.iloc[3:5] 返回df索引3到索引5(不包含索引5)的行(DataFrame对象)
|
行位置索引,列位置索引 | 只有DataFrame可用,格式行位置索引,列位置索引 ,行位置或列位置可以使用切片或数组等。
|
− | df.iloc[0, 2] 选取第1行第3列的单个值。df.iloc[2:5, 6 ] 返回第3行到5行中的第7列(Series对象)df.iloc[2:5, 0:2 ] 返回第3行到5行中的第1列到第2列(DataFrame对象)
|
布尔数组 | 如[True, False, True]。注意布尔数组长度要与轴标签长度相同,否则会抛出IndexError错误。 | s.iloc[[True, False, True]] 返回s的第1个和第3个值
|
df.iloc[[False, True, True]] 返回df的第2行和第3行
|
callable function | 会返回上面的一种索引形式 |
了解更多 >> Pandas 指南:索引与选择数据 - 按位置选择 Pandas 参考:DataFrame对象 - DataFrame.iloc Pandas 参考:Series对象 - Series.iloc
迭代
属性/方法 | 描述 | Series | DataFrame | 示例 |
---|---|---|---|---|
__iter__() | Series返回值的迭代器 DataFrame返回轴的迭代器 |
Series.__iter__() | DataFrame.__iter__() | s.__iter__()
|
items() | Series遍历,返回索引和值的迭代器 DataFrame按列遍历,返回列标签和列的Series对迭代器。 |
Series.items() | DataFrame.items() | s.items() df.items() for label, content in df.items():
|
iteritems() | 返回可迭代的键值对,Series返回索引和值,DataFrame返回列名和列。 | Series.iteritems() | DataFrame.iteritems() | |
iterrows() | Iterate over DataFrame rows as (index, Series) pairs. | − | DataFrame.iterrows() | |
itertuples() | Iterate over DataFrame rows as namedtuples. | − | DataFrame.itertuples(index=True, name='Pandas') |
处理数据
处理重复数据
如果要标识或删除重复的行,可以使用duplicated
和drop_duplicates
方法。
方法 | 描述 | 不同对象的方法 | 示例 |
---|---|---|---|
duplicated | 标识重复行,返回一个布尔值序列。参数: keep:默认为 keep='first' 标记第一次出现的重复项为False,其他都为True。keep='last' 标记最后出现的重复项为False,其他都为True。keep=False 标记所有重复项为True。
|
||
drop_duplicates | 删除重复行,返回删除后的对象。参数: keep:默认为 keep='first' 保留第一次出现的重复项,其他都删除。keep='last' 保留最后出现的重复项,其他都删除。keep=False 重复项都删除。
|
Series.drop_duplicates(keep='first', inplace=False) DataFrame.drop_duplicates(subset=None, keep='first', inplace=False, ignore_index=False) Index.drop_duplicates(keep='first') |
df.drop_duplicates() 删除df中所有列的值都相同的行。df.drop_duplicates(['日期', '品种']) 删除df中日期和品种列都相同的行
|
处理缺失的数据
数据类型转换
处理文本数据
字符串方法
Series和Index配备了一组字符串处理方法,这些方法使您可以轻松地对数组的每个元素进行操作。也许最重要的是,这些方法会自动排除丢失/ NA值。这些可以通过str属性访问。
方法 | 描述 | 示例 |
---|---|---|
Series.str.capitalize(*args, **kwargs) | Convert strings in the Series/Index to be capitalized. | |
Series.str.casefold(*args, **kwargs) | Convert strings in the Series/Index to be casefolded. | |
Series.str.cat(*args, **kwargs) | Concatenate strings in the Series/Index with given separator. | |
Series.str.center(*args, **kwargs) | Pad left and right side of strings in the Series/Index. | |
Series.str.contains(*args, **kwargs) | Test if pattern or regex is contained within a string of a Series or Index. | |
Series.str.count(*args, **kwargs) | Count occurrences of pattern in each string of the Series/Index. | |
Series.str.decode(encoding[, errors]) | Decode character string in the Series/Index using indicated encoding. | |
Series.str.encode(*args, **kwargs) | Encode character string in the Series/Index using indicated encoding. | |
Series.str.endswith(*args, **kwargs) | Test if the end of each string element matches a pattern. | |
Series.str.extract(*args, **kwargs) | Extract capture groups in the regex pat as columns in a DataFrame. | |
Series.str.extractall(*args, **kwargs) | Extract capture groups in the regex pat as columns in DataFrame. | |
Series.str.find(*args, **kwargs) | Return lowest indexes in each strings in the Series/Index. | |
Series.str.findall(*args, **kwargs) | Find all occurrences of pattern or regular expression in the Series/Index. | |
Series.str.get(i) | Extract element from each component at specified position. | |
Series.str.index(*args, **kwargs) | Return lowest indexes in each string in Series/Index. | |
Series.str.join(*args, **kwargs) | Join lists contained as elements in the Series/Index with passed delimiter. | |
Series.str.len(*args, **kwargs) | Compute the length of each element in the Series/Index. | |
Series.str.ljust(*args, **kwargs) | Pad right side of strings in the Series/Index. | |
Series.str.lower(*args, **kwargs) | Convert strings in the Series/Index to lowercase. | |
Series.str.lstrip(*args, **kwargs) | Remove leading characters. | |
Series.str.match(*args, **kwargs) | Determine if each string starts with a match of a regular expression. | |
Series.str.normalize(*args, **kwargs) | Return the Unicode normal form for the strings in the Series/Index. | |
Series.str.pad(*args, **kwargs) | Pad strings in the Series/Index up to width. | |
Series.str.partition(*args, **kwargs) | Split the string at the first occurrence of sep. | |
Series.str.repeat(*args, **kwargs) | Duplicate each string in the Series or Index. | |
Series.str.replace(*args, **kwargs) | Replace each occurrence of pattern/regex in the Series/Index. | |
Series.str.rfind(*args, **kwargs) | Return highest indexes in each strings in the Series/Index. | |
Series.str.rindex(*args, **kwargs) | Return highest indexes in each string in Series/Index. | |
Series.str.rjust(*args, **kwargs) | Pad left side of strings in the Series/Index. | |
Series.str.rpartition(*args, **kwargs) | Split the string at the last occurrence of sep. | |
Series.str.rstrip(*args, **kwargs) | Remove trailing characters. | |
Series.str.slice([start, stop, step]) | Slice substrings from each element in the Series or Index. | |
Series.str.slice_replace(*args, **kwargs) | Replace a positional slice of a string with another value. | |
Series.str.split(*args, **kwargs) | Split strings around given separator/delimiter. | |
Series.str.rsplit(*args, **kwargs) | Split strings around given separator/delimiter. | |
Series.str.startswith(*args, **kwargs) | Test if the start of each string element matches a pattern. | |
Series.str.strip(*args, **kwargs) | Remove leading and trailing characters. | |
Series.str.swapcase(*args, **kwargs) | Convert strings in the Series/Index to be swapcased. | |
Series.str.title(*args, **kwargs) | Convert strings in the Series/Index to titlecase. | |
Series.str.translate(*args, **kwargs) | Map all characters in the string through the given mapping table. | |
Series.str.upper(*args, **kwargs) | Convert strings in the Series/Index to uppercase. | |
Series.str.wrap(*args, **kwargs) | Wrap strings in Series/Index at specified line width. | |
Series.str.zfill(*args, **kwargs) | Pad strings in the Series/Index by prepending ‘0’ characters. | |
Series.str.isalnum(*args, **kwargs) | Check whether all characters in each string are alphanumeric. | |
Series.str.isalpha(*args, **kwargs) | Check whether all characters in each string are alphabetic. | |
Series.str.isdigit(*args, **kwargs) | Check whether all characters in each string are digits. | |
Series.str.isspace(*args, **kwargs) | Check whether all characters in each string are whitespace. | |
Series.str.islower(*args, **kwargs) | Check whether all characters in each string are lowercase. | |
Series.str.isupper(*args, **kwargs) | Check whether all characters in each string are uppercase. | |
Series.str.istitle(*args, **kwargs) | Check whether all characters in each string are titlecase. | |
Series.str.isnumeric(*args, **kwargs) | Check whether all characters in each string are numeric. | |
Series.str.isdecimal(*args, **kwargs) | Check whether all characters in each string are decimal. | |
Series.str.get_dummies(*args, **kwargs) | Return DataFrame of dummy/indicator variables for Series. |
合并和比较
合并
方法 | 描述 | 对象的方法 | 示例 | |
---|---|---|---|---|
concat() | 沿指定轴合并Series或DataFrame。 参数: objs ,由Series或DataFrame组成的列表或字典。axis ,指定轴{0,1,…},默认为axis=0表示沿行标签合并,axis=1表示沿列标签合并。join , {'inner','outer'},默认'outer'表示沿轴取并集,'inner'沿轴取交集。ignore_index ,布尔值,默认为False表示使用轴原来的标签(索引),True表示原来轴标签都不用,使用0开始递增的整数。keys ,列表,默认无。使用列表在轴标签(索引)外层再构造一层标签(索引)。
|
pandas.concat( objs, axis=0, join='outer', ignore_index=False, keys=None, levels=None, names=None, verify_integrity=False, sort=False, copy=True ) |
pd.concat([df1,df2]) 沿行标签合并 pd.concat([df1, df4], axis=1) 沿列标签合并 pd.concat([df1,df2,df3], keys=["x", "y", "z"]) 按行标签合并,并再添加一层行标签(由x,y,z组成)。对结果调用loc["y"]可选取df2数据pd.concat([df1, df4], axis=1, join="inner") 沿列标签取交集合并 pd.concat([s1, s2, s3], axis=1, keys=["time", "code", "price"])
| |
append() | 加入,Series的append方法用于连接多个Series。DataFrame的append方法用于从其他DataFrame对象加入多行,并返回一个新的DataFrame对象。 | Series.append(to_append, ignore_index=False, verify_integrity=False) DataFrame.append(other, ignore_index=False, verify_integrity=False, sort=False) |
s1.append(s2) s1后加入s2 df1.append(df2) df1后加入df2,返回加入后的DataFrame对象。df1.append(df2, ignore_index=True) 忽略原来行标签,结果为从0开始递增的整数。
| |
merge() | 将DataFrame或命名的Series合并,与数据库join操作类似。 参数: left ,DataFrame或命名的Series对象。right ,另一个DataFrame或命名的Series对象。on ,要连接的列或索引级别名称,必须同时在左右对象中找到。
|
pandas.merge( left, right, how='inner', on=None, left_on=None, right_on=None, left_index=False, right_index=False, sort=False, suffixes=('_x', '_y'), copy=True, indicator=False, validate=None ) |
||
join() | 连接另一个DataFrame的多列。 | DataFrame.join(other, on=None, how='left', lsuffix='', rsuffix='', sort=False) | ||
merge_ordered() | ||||
merge_asof() | ||||
assign() | Assign new columns to a DataFrame. | DataFrame.assign(**kwargs) | ||
update() | Modify in place using non-NA values from another DataFrame. | Series.update(other) DataFrame.update(other, join='left', overwrite=True, filter_func=None, errors='ignore') |
||
insert() | 在指定位置插入列。 | DataFrame.insert(loc, column, value, allow_duplicates=False) |
比较
属性/方法 | 描述 | Series | DataFrame | 示例 |
---|---|---|---|---|
isin() | Whether each element in the Series/DataFrame is contained in values. | Series.isin(values) | DataFrame.isin(values) | |
equals() | Test whether two objects contain the same elements. | Series.equals(other) | DataFrame.equals(other) | df.equals(df2)
|
计算统计
计算/描述统计
属性/方法 | 描述 | Series | DataFrame | 示例 |
---|---|---|---|---|
abs() | 返回 Series/DataFrame 每个元素的绝对值。 | Series.abs() | DataFrame.abs() | s.abs() df.abs()
|
all() | Return whether all elements are True, potentially over an axis. | Series.all(axis=0, bool_only=None, skipna=True, level=None, **kwargs) | DataFrame.all(axis=0, bool_only=None, skipna=True, level=None, **kwargs) | |
any() | Return whether any element is True, potentially over an axis. | Series.any(axis=0, bool_only=None, skipna=True, level=None, **kwargs) | DataFrame.any(axis=0, bool_only=None, skipna=True, level=None, **kwargs) | |
clip() | Trim values at input threshold(s). | Series.clip(lower=None, upper=None, axis=None, inplace=False, *args, **kwargs) | DataFrame.clip(lower=None, upper=None, axis=None, inplace=False, *args, **kwargs) | |
corr() | Compute pairwise correlation of columns, excluding NA/null values. | Series.corr(other, method='pearson', min_periods=None) | DataFrame.corr(method='pearson', min_periods=1) | |
corrwith() | Compute pairwise correlation. | DataFrame.corrwith(other, axis=0, drop=False, method='pearson') | ||
count() | 统计每行或每列值的个数,不包括NA值。 | Series.count(level=None) | DataFrame.count(axis=0, level=None, numeric_only=False) | s.count() df.count() df.count(axis='columns')
|
cov() | Compute pairwise covariance of columns, excluding NA/null values. | Series.cov(other, min_periods=None, ddof=1) | DataFrame.cov(min_periods=None, ddof=1) | |
cummax() | Return cumulative maximum over a DataFrame or Series axis. | Series.cummax(axis=None, skipna=True, *args, **kwargs) | DataFrame.cummax(axis=None, skipna=True, *args, **kwargs) | |
cummin() | Return cumulative minimum over a DataFrame or Series axis. | Series.cummin(axis=None, skipna=True, *args, **kwargs) | DataFrame.cummin(axis=None, skipna=True, *args, **kwargs) | |
cumprod() | Return cumulative product over a DataFrame or Series axis. | Series.cumprod(axis=None, skipna=True, *args, **kwargs) | DataFrame.cumprod(axis=None, skipna=True, *args, **kwargs) | |
cumsum() | Return cumulative sum over a DataFrame or Series axis. | Series.cumsum(axis=None, skipna=True, *args, **kwargs) | DataFrame.cumsum(axis=None, skipna=True, *args, **kwargs) | |
describe() | Generate descriptive statistics. | Series.describe(percentiles=None, include=None, exclude=None, datetime_is_numeric=False) | DataFrame.describe(percentiles=None, include=None, exclude=None, datetime_is_numeric=False) | |
diff() | First discrete difference of element. | Series.diff(periods=1) | DataFrame.diff(periods=1, axis=0) | |
eval() | Evaluate a string describing operations on DataFrame columns. | DataFrame.eval(expr, inplace=False, **kwargs) | ||
kurt() | Return unbiased kurtosis over requested axis. | Series.kurt(axis=None, skipna=None, level=None, numeric_only=None, **kwargs) | DataFrame.kurt(axis=None, skipna=None, level=None, numeric_only=None, **kwargs) | |
kurtosis() | Return unbiased kurtosis over requested axis. | Series.kurtosis(axis=None, skipna=None, level=None, numeric_only=None, **kwargs) | DataFrame.kurtosis(axis=None, skipna=None, level=None, numeric_only=None, **kwargs) | |
mad() | Return the mean absolute deviation of the values for the requested axis. | Series.mad(axis=None, skipna=None, level=None) | DataFrame.mad(axis=None, skipna=None, level=None) | |
max() | Return the maximum of the values for the requested axis. | Series.max(axis=None, skipna=None, level=None, numeric_only=None, **kwargs) | DataFrame.max(axis=None, skipna=None, level=None, numeric_only=None, **kwargs) | |
mean() | Return the mean of the values for the requested axis. | Series.mean(axis=None, skipna=None, level=None, numeric_only=None, **kwargs) | DataFrame.mean(axis=None, skipna=None, level=None, numeric_only=None, **kwargs) | |
median() | Return the median of the values for the requested axis. | Series.median(axis=None, skipna=None, level=None, numeric_only=None, **kwargs) | DataFrame.median(axis=None, skipna=None, level=None, numeric_only=None, **kwargs) | |
min() | Return the minimum of the values for the requested axis. | Series.min(axis=None, skipna=None, level=None, numeric_only=None, **kwargs) | DataFrame.min(axis=None, skipna=None, level=None, numeric_only=None, **kwargs) | |
mode() | Get the mode(s) of each element along the selected axis. | Series.mode(dropna=True) | DataFrame.mode(axis=0, numeric_only=False, dropna=True) | |
pct_change() | Percentage change between the current and a prior element. | Series.pct_change(periods=1, fill_method='pad', limit=None, freq=None, **kwargs) | DataFrame.pct_change(periods=1, fill_method='pad', limit=None, freq=None, **kwargs) | |
prod() | Return the product of the values for the requested axis. | Series.prod(axis=None, skipna=None, level=None, numeric_only=None, min_count=0, **kwargs) | DataFrame.prod(axis=None, skipna=None, level=None, numeric_only=None, min_count=0, **kwargs) | |
product() | Return the product of the values for the requested axis. | Series.product(axis=None, skipna=None, level=None, numeric_only=None, min_count=0, **kwargs) | DataFrame.product(axis=None, skipna=None, level=None, numeric_only=None, min_count=0, **kwargs) | |
quantile() | Return values at the given quantile over requested axis. | Series.quantile(q=0.5, interpolation='linear') | DataFrame.quantile(q=0.5, axis=0, numeric_only=True, interpolation='linear') | |
rank() | Compute numerical data ranks (1 through n) along axis. | Series.rank(axis=0, method='average', numeric_only=None, na_option='keep', ascending=True, pct=False) | DataFrame.rank(axis=0, method='average', numeric_only=None, na_option='keep', ascending=True, pct=False) | |
round() | Round a DataFrame to a variable number of decimal places. | Series.round(decimals=0, *args, **kwargs) | DataFrame.round(decimals=0, *args, **kwargs) | |
sem() | Return unbiased standard error of the mean over requested axis. | Series.sem(axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs) | DataFrame.sem(axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs) | |
skew() | Return unbiased skew over requested axis. | Series.skew(axis=None, skipna=None, level=None, numeric_only=None, **kwargs) | DataFrame.skew(axis=None, skipna=None, level=None, numeric_only=None, **kwargs) | |
sum() | Return the sum of the values for the requested axis. | Series.sum(axis=None, skipna=None, level=None, numeric_only=None, min_count=0, **kwargs) | DataFrame.sum(axis=None, skipna=None, level=None, numeric_only=None, min_count=0, **kwargs) | |
std() | Return sample standard deviation over requested axis. | Series.std(axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs) | DataFrame.std(axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs) | |
var() | Return unbiased variance over requested axis. | Series.var(axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs) | DataFrame.var(axis=None, skipna=None, level=None, ddof=1, numeric_only=None, **kwargs) | |
nunique() | Count distinct observations over requested axis. | Series.nunique(dropna=True) | DataFrame.nunique(axis=0, dropna=True) | |
value_counts() | Return a Series containing counts of unique rows in the DataFrame. | Series.value_counts(normalize=False, sort=True, ascending=False, bins=None, dropna=True) | DataFrame.value_counts(subset=None, normalize=False, sort=True, ascending=False) |
二元运算功能
属性/方法 | 描述 | Series | DataFrame | 示例 |
---|---|---|---|---|
add() | Get Addition of dataframe and other, element-wise (binary operator add). | Series.add(other, level=None, fill_value=None, axis=0) | DataFrame.add(other, axis='columns', level=None, fill_value=None) | |
sub() | Get Subtraction of dataframe and other, element-wise (binary operator sub). | Series.sub(other, level=None, fill_value=None, axis=0) | DataFrame.sub(other, axis='columns', level=None, fill_value=None) | |
mul() | Get Multiplication of dataframe and other, element-wise (binary operator mul). | Series.mul(other, level=None, fill_value=None, axis=0) | DataFrame.mul(other, axis='columns', level=None, fill_value=None) | |
div() | Get Floating division of dataframe and other, element-wise (binary operator truediv). | Series.div(other, level=None, fill_value=None, axis=0) | DataFrame.div(other, axis='columns', level=None, fill_value=None) | |
truediv() | Get Floating division of dataframe and other, element-wise (binary operator truediv). | Series.truediv(other, level=None, fill_value=None, axis=0) | DataFrame.truediv(other, axis='columns', level=None, fill_value=None) | |
floordiv() | Get Integer division of dataframe and other, element-wise (binary operator floordiv). | Series.floordiv(other, level=None, fill_value=None, axis=0) | DataFrame.floordiv(other, axis='columns', level=None, fill_value=None) | |
mod() | Get Modulo of dataframe and other, element-wise (binary operator mod). | Series.mod(other, level=None, fill_value=None, axis=0) | DataFrame.mod(other, axis='columns', level=None, fill_value=None) | |
pow() | Get Exponential power of dataframe and other, element-wise (binary operator pow). | Series.pow(other, level=None, fill_value=None, axis=0) | DataFrame.pow(other, axis='columns', level=None, fill_value=None) | |
dot() | Compute the matrix multiplication between the DataFrame and other. | Series.dot(other) | DataFrame.dot(other) | |
radd() | Get Addition of dataframe and other, element-wise (binary operator radd). | Series.radd(other, level=None, fill_value=None, axis=0) | DataFrame.radd(other, axis='columns', level=None, fill_value=None) | |
rsub() | Get Subtraction of dataframe and other, element-wise (binary operator rsub). | Series.rsub(other, level=None, fill_value=None, axis=0) | DataFrame.rsub(other, axis='columns', level=None, fill_value=None) | |
rmul() | Get Multiplication of dataframe and other, element-wise (binary operator rmul). | Series.rmul(other, level=None, fill_value=None, axis=0) | DataFrame.rmul(other, axis='columns', level=None, fill_value=None) | |
rdiv() | Get Floating division of dataframe and other, element-wise (binary operator rtruediv). | Series.rdiv(other, level=None, fill_value=None, axis=0) | DataFrame.rdiv(other, axis='columns', level=None, fill_value=None) | |
rtruediv() | Get Floating division of dataframe and other, element-wise (binary operator rtruediv). | Series.rtruediv(other, level=None, fill_value=None, axis=0) | DataFrame.rtruediv(other, axis='columns', level=None, fill_value=None) | |
rfloordiv() | Get Integer division of dataframe and other, element-wise (binary operator rfloordiv). | Series.rfloordiv(other, level=None, fill_value=None, axis=0) | DataFrame.rfloordiv(other, axis='columns', level=None, fill_value=None) | |
rmod() | Get Modulo of dataframe and other, element-wise (binary operator rmod). | Series.rmod(other, level=None, fill_value=None, axis=0) | DataFrame.rmod(other, axis='columns', level=None, fill_value=None) | |
rpow() | Get Exponential power of dataframe and other, element-wise (binary operator rpow). | Series.rpow(other, level=None, fill_value=None, axis=0) | DataFrame.rpow(other, axis='columns', level=None, fill_value=None) | |
lt() | Get Less than of dataframe and other, element-wise (binary operator lt). | Series.lt(other, level=None, fill_value=None, axis=0) | DataFrame.lt(other, axis='columns', level=None) | |
gt() | Get Greater than of dataframe and other, element-wise (binary operator gt). | Series.gt(other, level=None, fill_value=None, axis=0) | DataFrame.gt(other, axis='columns', level=None) | |
le() | Get Less than or equal to of dataframe and other, element-wise (binary operator le). | Series.le(other, level=None, fill_value=None, axis=0) | DataFrame.le(other, axis='columns', level=None) | |
ge() | Get Greater than or equal to of dataframe and other, element-wise (binary operator ge). | Series.ge(other, level=None, fill_value=None, axis=0) | DataFrame.ge(other, axis='columns', level=None) | |
ne() | Get Not equal to of dataframe and other, element-wise (binary operator ne). | Series.ne(other, level=None, fill_value=None, axis=0) | DataFrame.ne(other, axis='columns', level=None) | |
eq() | Get Equal to of dataframe and other, element-wise (binary operator eq). | Series.eq(other, level=None, fill_value=None, axis=0) | DataFrame.eq(other, axis='columns', level=None) | |
combine() | Perform column-wise combine with another DataFrame. | Series.combine(other, func, fill_value=None) | DataFrame.combine(other, func, fill_value=None, overwrite=True) | |
combine_first() | Update null elements with value in the same location in other. | Series.combine_first(other) | DataFrame.combine_first(other) |
GroupBy分组
创建GroupBy对象
类名 | 创建对象方法 | 完整参数 | 示例 |
---|---|---|---|
SeriesGroupBy | Series.groupby() | Series.groupby(by=None, axis=0, level=None, as_index=True, sort=True, group_keys=True, squeeze=<object object>, observed=False, dropna=True) | |
DataFrameGroupBy | DataFrame.groupby() | DataFrame.groupby(by=None, axis=0, level=None, as_index=True, sort=True, group_keys=True, squeeze=<object object>, observed=False, dropna=True) | df.groupby('code') 或df.groupby(by='code') 按code列分组,创建一个GroupBy对象
|
GroupBy属性与方法
选取与迭代
属性/方法 | 描述 | 示例 |
---|---|---|
GroupBy.__iter__() | Groupby迭代器 | |
GroupBy.groups | Dict{组名->组标签} | for name, group in grouped: print(name) print(group) |
GroupBy.indices | Dict{组名->组索引} | |
GroupBy.get_group(name, obj=None) | 通过组名选取一个组,返回DataFrame格式。 | grouped.get_group('AAPL') |
pandas.Grouper(*args, **kwargs) | Grouper对象,用于为目标对象指定groupby分组指令。 | x.describe() |
功能应用
属性/方法 | 描述 | Series | DataFrame | 示例 |
---|---|---|---|---|
GroupBy.apply() | 应用,按组应用函数func,并将结果组合在一起。 | GroupBy.apply(func, *args, **kwargs) | GroupBy.apply(func, *args, **kwargs) | grouped['C'].apply(lambda x: x.describe()) |
GroupBy.agg() | 聚合,等效aggregate() | GroupBy.agg(func, *args, **kwargs) | GroupBy.agg(func, *args, **kwargs) | |
aggregate() | 聚合,在指定轴上使用一项或多项操作进行汇总。 | SeriesGroupBy.aggregate(func=None, *args, engine=None, engine_kwargs=None, **kwargs) | DataFrameGroupBy.aggregate(func=None, *args, engine=None, engine_kwargs=None, **kwargs) | |
transform() | 转换,按组调用函数,并将原始数据替换为转换后的结果 | SeriesGroupBy.transform(func, *args, engine=None, engine_kwargs=None, **kwargs) | DataFrameGroupBy.transform(func, *args, engine=None, engine_kwargs=None, **kwargs) | |
GroupBy.pipe() | 将带有参数的函数func应用于GroupBy对象,并返回函数的结果。 | GroupBy.pipe(func, *args, **kwargs) | GroupBy.pipe(func, *args, **kwargs) |
计算/描述统计
属性/方法 | 描述 | Series | DataFrame | 示例 |
---|---|---|---|---|
GroupBy.all() | Return True if all values in the group are truthful, else False. | GroupBy.all(skipna=True) | DataFrameGroupBy.all(skipna=True) | |
GroupBy.any() | Return True if any value in the group is truthful, else False. | GroupBy.any(skipna=True) | DataFrameGroupBy.any(skipna=True) | |
GroupBy.backfill() | Backward fill the values. | GroupBy.backfill(limit=None) | DataFrameGroupBy.backfill(limit=None) | |
GroupBy.bfill() | 同 GroupBy.backfill() | GroupBy.bfill(limit=None) | DataFrameGroupBy.bfill(limit=None) | |
GroupBy.count() | 统计每组值的个数,不包含缺失值。 | GroupBy.count() | DataFrameGroupBy.count() | grouped.count() |
GroupBy.cumcount() | Number each item in each group from 0 to the length of that group - 1. | GroupBy.cumcount(ascending=True) | DataFrameGroupBy.cumcount(ascending=True) | |
GroupBy.cummax() | Cumulative max for each group. | GroupBy.cummax(axis=0, **kwargs) | DataFrameGroupBy.cummax(axis=0, **kwargs) | |
GroupBy.cummin() | Cumulative min for each group. | GroupBy.cummin(axis=0, **kwargs) | DataFrameGroupBy.cummin(axis=0, **kwargs) | |
GroupBy.cumprod() | Cumulative product for each group. | GroupBy.cumprod(axis=0, *args, **kwargs) | DataFrameGroupBy.cumprod(axis=0, *args, **kwargs) | |
GroupBy.cumsum() | Cumulative sum for each group. | GroupBy.cumsum(axis=0, *args, **kwargs) | DataFrameGroupBy.cumsum(axis=0, *args, **kwargs) | |
GroupBy.ffill() | Forward fill the values. | GroupBy.ffill(limit=None) | DataFrameGroupBy.ffill(limit=None) | |
GroupBy.first() | Compute first of group values. | GroupBy.first(numeric_only=False, min_count=-1) | ||
GroupBy.head() | 返回每组的前n行,默认5行 | GroupBy.head(n=5) | ||
GroupBy.last() | Compute last of group values. | GroupBy.last(numeric_only=False, min_count=-1) | ||
GroupBy.max() | Compute max of group values. | GroupBy.max(numeric_only=False, min_count=-1) | ||
GroupBy.mean() | Compute mean of groups, excluding missing values. | GroupBy.mean(numeric_only=True) | ||
GroupBy.median() | Compute median of groups, excluding missing values. | GroupBy.median(numeric_only=True) | ||
GroupBy.min([numeric_only, min_count]) | Compute min of group values. | GroupBy.min(numeric_only=False, min_count=-1) | ||
GroupBy.ngroup([ascending]) | Number each group from 0 to the number of groups - 1. | GroupBy.ngroup(ascending=True) | ||
GroupBy.nth(n[, dropna]) | 如果参数n是一个整数,则取每个组的第n行;如果n是一个整数列表,则取每组行的子集。 | GroupBy.nth(n, dropna=None) | ||
GroupBy.ohlc() | 计算组的开始值,最高值,最低值和末尾值,不包括缺失值。 | GroupBy.ohlc() | ||
GroupBy.pad() | Forward fill the values. | GroupBy.pad(limit=None) | DataFrameGroupBy.pad(limit=None) | |
GroupBy.prod([numeric_only, min_count]) | Compute prod of group values. | GroupBy.prod(numeric_only=True, min_count=0) | ||
GroupBy.rank([method, ascending, na_option, …]) | Provide the rank of values within each group. | GroupBy.rank(method='average', ascending=True, na_option='keep', pct=False, axis=0) | DataFrameGroupBy.rank(method='average', ascending=True, na_option='keep', pct=False, axis=0) | |
GroupBy.pct_change([periods, fill_method, …]) | Calculate pct_change of each value to previous entry in group. | GroupBy.pct_change(periods=1, fill_method='pad', limit=None, freq=None, axis=0) | DataFrameGroupBy.pct_change(periods=1, fill_method='pad', limit=None, freq=None, axis=0) | |
GroupBy.size() | Compute group sizes. | GroupBy.size() | DataFrameGroupBy.size() | |
GroupBy.sem() | Compute standard error of the mean of groups, excluding missing values. | GroupBy.sem(ddof=1) | ||
GroupBy.std() | Compute standard deviation of groups, excluding missing values. | GroupBy.std(ddof=1) | ||
GroupBy.sum([numeric_only, min_count]) | Compute sum of group values. | GroupBy.sum(numeric_only=True, min_count=0) | ||
GroupBy.var([ddof]) | Compute variance of groups, excluding missing values. | GroupBy.var(ddof=1) | ||
GroupBy.tail() | 返回每组的最后n行,默认5行 | GroupBy.tail(n=5) |
时间序列
概览
Pandas把时间相关分为4种概念,用8个类来表示。
概念 | 描述 | 标量类 | 数组类 | pandas数据类型 | 主要创建方法 | 示例 |
---|---|---|---|---|---|---|
日期时间 | 支持时区的特定日期时间点。 类似Python标准库的datetime.datetime。 |
Timestamp | DatetimeIndex | datetime64[ns] 或 datetime64[ns, tz] |
to_datetime() date_range() |
pd.to_datetime('2020-01-01') 生成:Timestamp('2020-01-01 00:00:00')
|
时间增量 | 持续时间,即两个日期或时间的差值。 类似Python标准库的datetime.timedelta。 |
Timedelta | TimedeltaIndex | timedelta64[ns] | to_timedelta() timedelta_range() |
|
时间跨度 | 由时间点及其关联的频率定义的时间跨度。 | Period | PeriodIndex | period[freq] | Period() period_range() |
|
日期偏移 | 日期增量 | DateOffset | None | None | DateOffset() |
了解更多 >> pandas 用户指南:时间序列
日期时间属性
以下是Timestamp类和DatetimeIndex类的一些属性或方法。
属性 | 描述 | 示例 |
---|---|---|
year | 年 | |
month | 月 | |
day | 日 | |
hour | 小时 | |
minute | 分钟 | |
second | 秒 | |
microsecond | 微秒 | |
nanosecond | 纳秒 | |
date | 日期(不包含时区信息) | |
time | 时间(不包含时区信息) | |
timetz() | 时间(包含本地时区信息) | |
day_of_year / dayofyear | 一年里的第几天 | |
week / weekofyear | 一年里的第几周 | |
day_of_week / dayofweek / weekday | 一周里的第几天,Monday(星期一)=0,Sunday(星期天)=6 | |
quarter | 日期所处的季度,如(1月、2月、3月)=1,(4月、5月、6月)=2 | |
days_in_month | 日期所在的月有多少天 | |
is_month_start | 是否月初(由频率定义) | |
is_month_end | 是否月末(由频率定义) | |
is_quarter_start | 是否季初(由频率定义) | |
is_quarter_end | 是否季末(由频率定义) | |
is_year_start | 是否年初(由频率定义) | |
is_year_end | 是否年末(由频率定义) | |
is_leap_year | 是否闰年 |
日期偏移
DateOffset对象用来处理日期偏移。
日期偏移量 | 频率字符串 | 描述 | 示例 |
---|---|---|---|
DateOffset | 无 | 通用偏移类,默认为24小时 | |
Day | 'D' | 一天 | |
Hour | 'H' | 一小时 | |
Minute | 'T' 或 'min' | 一分钟 | |
Second | 'S' | 一秒 | |
Milli | 'L' 或 'ms' | 一毫秒 | |
Micro | 'U' 或 'us' | 一微秒 | |
Nano | 'N' | 一纳秒 | |
BDay 或 BusinessDay | 'B' | 工作日 | |
CDay 或 CustomBusinessDay | 'C' | 自定义工作日 | |
Week | 'W' | 一周,可选锚定周几 | |
WeekOfMonth | 'WOM' | 每月第几周的第几天 | |
LastWeekOfMonth | 'LWOM' | 每月最后一周的第几天 | |
MonthEnd | 'M' | 日历月末 | |
MonthBegin | 'MS' | 日历月初 | |
BMonthEnd 或 BusinessMonthEnd | 'BM' | 工作日月末 | |
BMonthBegin 或 BusinessMonthBegin | 'BMS' | 工作日月初 | |
CBMonthEnd 或 CustomBusinessMonthEnd | 'CBM' | 自定义工作日月末 | |
CBMonthBegin 或 CustomBusinessMonthBegin | 'CBMS' | 自定义工作日月初 | |
SemiMonthEnd | 'SM' | 月第15天(或其他天数)与日历月末 | |
SemiMonthBegin | 'SMS' | 日历月初与月第15天(或其他天数) | |
QuarterEnd | 'Q' | 日历季末 | |
QuarterBegin | 'QS' | 日历季初 | |
BQuarterEnd | 'BQ' | 工作季末 | |
BQuarterBegin | 'BQS' | 工作季初 | |
FY5253Quarter | 'REQ' | 零售(又名 52-53 周)季 | |
YearEnd | 'A' | 日历年末 | |
YearBegin | 'AS' 或 'BYS' | 日历年初 | |
BYearEnd | 'BA' | 工作日年末 | |
BYearBegin | 'BAS' | 工作日年初 | |
FY5253 | 'RE' | 零售(又名 52-53 周)年 | |
Easter | 无 | 复活节假日 | |
BusinessHour | 'BH' | 工作小时 | |
CustomBusinessHour | 'CBH' | 自定义工作小时 |
时间序列相关
属性/方法 | 描述 | Series | DataFrame | 示例 |
---|---|---|---|---|
asfreq() | Convert TimeSeries to specified frequency. | Series.asfreq(freq, method=None, how=None, normalize=False, fill_value=None) | DataFrame.asfreq(freq, method=None, how=None, normalize=False, fill_value=None) | |
asof() | Return the last row(s) without any NaNs before where. | Series.asof(where, subset=None) | DataFrame.asof(where, subset=None) | |
shift() | Shift index by desired number of periods with an optional time freq. | Series.shift(periods=1, freq=None, axis=0, fill_value=None) | DataFrame.shift(periods=1, freq=None, axis=0, fill_value=None) | |
slice_shift() | Equivalent to shift without copying data. | Series.slice_shift(periods=1, axis=0) | DataFrame.slice_shift(periods=1, axis=0) | |
tshift() | (DEPRECATED) Shift the time index, using the index’s frequency if available. | Series.tshift(periods=1, freq=None, axis=0) | DataFrame.tshift(periods=1, freq=None, axis=0) | |
first_valid_index() | Return index for first non-NA/null value. | Series.first_valid_index() | DataFrame.first_valid_index() | |
last_valid_index() | Return index for last non-NA/null value. | Series.last_valid_index() | DataFrame.last_valid_index() | |
resample() | Resample time-series data. | Series.resample(rule, axis=0, closed=None, label=None, convention='start', kind=None, loffset=None, base=None, on=None, level=None, origin='start_day', offset=None) | DataFrame.resample(rule, axis=0, closed=None, label=None, convention='start', kind=None, loffset=None, base=None, on=None, level=None, origin='start_day', offset=None) | |
to_period() | Convert DataFrame from DatetimeIndex to PeriodIndex. | Series.to_period(freq=None, copy=True) | DataFrame.to_period(freq=None, axis=0, copy=True) | |
to_timestamp() | Cast to DatetimeIndex of timestamps, at beginning of period. | Series.to_timestamp(freq=None, how='start', copy=True) | DataFrame.to_timestamp(freq=None, how='start', axis=0, copy=True) | |
tz_convert() | Convert tz-aware axis to target time zone. | Series.tz_convert(tz, axis=0, level=None, copy=True) | DataFrame.tz_convert(tz, axis=0, level=None, copy=True) | |
tz_localize() | Localize tz-naive index of a Series or DataFrame to target time zone. | Series.tz_localize(tz, axis=0, level=None, copy=True, ambiguous='raise', nonexistent='raise') | DataFrame.tz_localize(tz, axis=0, level=None, copy=True, ambiguous='raise', nonexistent='raise') |
绘图
pandas绘图基于Matplotlib,pandas的DataFrame和Series都自带生成各类图表的plot方法,能够方便快速生成各种图表。
了解更多 >> pandas 用户指南:可视化
基本图形
折线图
plot方法默认生成的就是折线图。如prices是一个含有收盘价close列的DataFrame,绘制收盘价的折线图:
s = prices['close']
s.plot()
#设置图片大小,使用figsize参数
s.plot(figsize=(20,10))
条形图
对于不连续标签,没有时间序列的数据,可以绘制条形图,使用以下两种方法:
- 使用plot()函数,设置kind参数为'bar'或'barh',
- 使用plot.bar()函数,plot.barh()函数
df.plot(kind='bar') #假设df为每天股票数据
df.plot.bar()
df.resample('A-DEC').mean().volume.plot(kind='bar') #重采集每年成交量平均值,绘制条形图(volume为df的成交量列)
df.plot.bar(stacked=True) #stacked=True表示堆积条形图
df.plot.barh(stacked=True) #barh 表示水平条形图
直方图
直方图使用plot.hist()方法绘制,一般为频数分布直方图,x轴分区间,y轴为频数。组数用参数bins控制,如分20组bins=20
df.volume.plot.hist() #df股票数据中成交量volume的频数分布直方图。
df.plot.hist(alpha=0.5) #alpha=0.5 表示柱形的透明度为0.5
df.plot.hist(stacked=True, bins=20) #stacked=True表示堆积绘制,bins=20表示分20组。
df.plot.hist(orientation='horizontal') #orientation='horizontal' 表示水平直方图
df.plot.hist(cumulative=True) #表示累计直方图
df['close'].diff().hist() #收盘价上应用diff函数,再绘制直方图
df.hist(color='k', bins=50) #DataFrame.hist函数将每列绘制在不同的子图形上。
箱型图
箱型图可以使用plot.box()函数或DataFrame的boxplot()绘制。 参数:
- color,用来设置颜色,通过传入颜色字典,如color={'boxes': 'DarkGreen', 'whiskers': 'DarkOrange', 'medians': 'DarkBlue', 'caps': 'Gray'}
- sym,用来设置异常值样式,如sym='r+'表示异常值用'红色+'表示。
df.plot.box()
df[['close','open', 'high']].plot.box()
#改变箱型颜色,通过传入颜色字典
color={'boxes': 'DarkGreen', 'whiskers': 'DarkOrange', 'medians': 'DarkBlue', 'caps': 'Gray'}
df.plot.box(color=color, sym='r+') #sym用来设置异常值样式,'r+'表示'红色+'
df.plot.box(positions=[1, 4, 5, 6, 8]) #positions表示显示位置,df有5个列, 第一列显示在x轴1上,第二列显示在x轴4上,以此类推
df.plot.box(vert=False) #表示绘制水平箱型图
df.boxplot()
#绘制分层箱型图,通过设置by关键词创建分组,再按组,分别绘制箱型图。如下面例子,每列按A组,B组分别绘制箱型图。
df = pd.DataFrame(np.random.rand(10, 2), columns=['Col1', 'Col2'])
df['x'] = pd.Series(['A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B'])
df.boxplot(by='x')
#还可以再传入一个子分类,再进一步分组绘制。如:
df.boxplot(column=['Col1', 'Col2'], by=['X', 'Y'])
散点图
散点图使用DataFrame.plot.scatter()方法绘制。通过参数x,y指定x轴和y轴的数据列。
df.plot.scatter(x='close', y='volume') #假如df为每日股票数据,图表示收盘价与成交量的散点图
#将两组散点图绘制在一张图表上,需要重复使用ax参数,如
ax = df.plot.scatter(x='close', y='volume', color='DarkBlue', label='Group 1') #label参数用来设置标签名
df.plot.scatter(x='open', y='value', color='DarkGreen', label='Group 2', ax=ax)
#c参数表示圆点的颜色按volume列大小来渐变表示。
df.plot.scatter(x='close', y='open', c='volume', s=50) #s表示圆点面积大小
df.plot.scatter(x='close', y='open', s=df['volume']/50000) #圆点的大小也可以根据某列数值大小相应设置。
饼图
饼图使用DataFrame.plot.pie()或Series.plot.pie()绘制。如果数据中有空值,会自动使用0填充。
其他绘图函数
这些绘图函数来自pandas.plotting模块。
矩阵散点图(Scatter Matrix Plot)
矩阵散点图(Scatter Matrix Plot)使用scatter_matrix()方法绘制
from pandas.plotting import scatter_matrix #使用前需要从模块中导入该函数
scatter_matrix(df, alpha=0.2, figsize=(6, 6), diagonal='kde') #假设df是每日股票数据,会每一列相对其他每一列生成一个散点图。
密度图(Density Plot)
密度图使用Series.plot.kde()和DataFrame.plot.kde()函数。
df.plot.kde()
安德鲁斯曲线(Andrews Curves)
安德鲁斯曲线
平行坐标图(Parallel Coordinates)
Lag plot
自相关图(Autocorrelation Plot)
自相关图
自举图(Bootstrap plot)
绘图格式
预设置图形样式
matplotlib从1.5版本开始支持预设样式,绘图前调用matplotlib.style.use(my_plot_style)即可切换样式。如matplotlib.style.use('ggplot')表示使用ggplot风格绘图。
样式参数
大多数绘图函数,可以通过一组参数来设置颜色。
标签设置
可通过设置legend参数为False来隐藏图例,如
df.plot(legend=False)
尺度
- logy参数用来将y轴设置对数标尺
- logx参数用来将x轴设置对数标尺
- loglog参数用来将x轴和y轴设置对数标尺
ts.plot(logy=True)
双坐标图
两组序列同x轴,但y轴数据不同,可以通过第二个序列设置参数:secondary_y=True,来设置第二个y轴。
#比如想在收盘价图形上显示cci指标:
prices['close'].plot()
prices['cci'].plot(secondary_y=True)
#第二个坐标轴要显示多个,可以直接传入列名
ax = df.plot(secondary_y=['cci', 'RSI'], mark_right=False) #右边轴的列在图例中默认会自动标注"(right)",设置mark_right为False可取消该标注
ax.set_ylabel('CD scale') #设置左边y轴名称
ax.right_ax.set_ylabel('AB scale') #设置右边y轴名称
子图
DataFrame的每一列可以绘制在不同的坐标轴(axis)中,使用subplots参数设置,例如:
df.plot(subplots=True, figsize=(6, 6))
子图布局
子图布局使用关键词layout设置,
输入输出
pandas的读取函数是顶层函数,如pandas.read_csv()一般返回一个pandas对象。写入函数是相应对象的方法,如DataFrame.to_csv()将DataFrame对象写入到csv文件。下表是可用的读取和写入函数。
数据描述 | 格式类型 | 读取函数 | 写入函数 |
---|---|---|---|
CSV | text | read_csv | to_csv |
Fixed-Width Text File | text | read_fwf | |
JSON | text | read_json | to_json |
HTML | text | read_html | to_html |
Local clipboard | text | read_clipboard | to_clipboard |
MS Excel | binary | read_excel | to_excel |
OpenDocument | binary | read_excel | |
HDF5 Format | binary | read_hdf | to_hdf |
Feather Format | binary | read_feather | to_feather |
Parquet Format | binary | read_parquet | to_parquet |
ORC Format | binary | read_orc | |
Msgpack | binary | read_msgpack | to_msgpack |
Stata | binary | read_stata | to_stata |
SAS | binary | read_sas | |
SPSS | binary | read_spss | |
Python Pickle Format | binary | read_pickle | to_pickle |
SQL | SQL | read_sql | to_sql |
Google BigQuery | SQL | read_gbq | to_gbq |
资源
官网
相关网站
书籍
《利用Python进行数据分析 第2版》 - Wes McKinney