pandas note1

pandas is a Python-based data analysis package that provides the tools needed to manipulate large datasets efficiently, as well as functions and methods to process data quickly and easily.
1. Import pandas library

"""
In [1]: import pandas as pd
"""

Series object (used to store one-dimensional data)
1. Declare the Series object

"""
In [2]: s = pd.Series([6,5,9,-3])
In [3]: s
Out[3]: 
0    6
1    5
2    9
3   -3
dtype: int64
# Specify a custom index with the index option
In [7]: s = pd.Series([6,5,9,-3], index=['a', 'b', 'c', 'd'])
In [8]: s
Out[8]: 
a    6
b    5
c    9
d   -3
dtype: int64
# Apart from sets and multi-dimensional arrays, any data can be used as the data of a Series
In [11]: import numpy as np
In [12]: arr = np.array([1, 2, 3, 4]) # 一维array
In [13]: s=pd.Series(arr)
In [14]: s
Out[14]: 
0    1
1    2
2    3
3    4
dtype: int32
"""

2. Get the elements inside the Series object

"""
# Get an element of the Series object by its index label
In [17]: s['b']
Out[17]: 5
# Get an element by its positional key (this example indexes the NumPy array arr)
In [19]: arr[2]
Out[19]: 3
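# Prefer lookups by index label; positional keys are unreliable (the original note warns that two consecutive duplicate index entries raise an IndexError)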
"""

3. Assignment

"""
# When assigning, using an index label that does not exist yet is equivalent to adding a new index entry with that value.
In [20]: s['b']=5
In [21]: arr[2]=2
"""

4. Filter (select some elements)

"""
In [21]: s
Out[21]: 
a    6
b    5
c    9
d   -3
dtype: int64
In [22]: s>0
Out[22]: 
a     True
b     True
c     True
d    False
dtype: bool
In [23]: s[s>0]
Out[23]: 
a    6
b    5
c    9
dtype: int64
"""

5. Several methods (unique, value_counts, isin)

"""
In [26]: pds = pd.Series([0,1,2,3,3,4], index=['one', 'two', 'three', 'four', 'five', 'six'])
In [27]: pds
Out[27]: 
one      0
two      1
three    2
four     3
five     3
six      4
dtype: int64
# unique() shows which distinct elements the Series contains; it returns an array of the de-duplicated values in order of first appearance (not sorted)
In [30]: pds.unique()
Out[30]: array([0, 1, 2, 3, 4], dtype=int64)
# value_counts() returns each distinct element together with the number of times it occurs in the Series
In [31]: pds.value_counts()
Out[31]: 
3    2
4    1
2    1
1    1
0    1
dtype: int64
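# isin() tests membership and returns booleans, which can be used to filter data; the argument must be a set or list-like (a single string is not accepted)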
In [32]: pds.isin([0, 3])
Out[32]: 
one       True
two      False
three    False
four      True
five      True
six      False
dtype: bool
In [33]: pds[pds.isin([0, 3])]
Out[33]: 
one     0
four    3
five    3
dtype: int64
"""

6. NaN (Not a Number)

"""
# NaN is the special value used when a field in the data structure is empty or does not meet the definition of a number.
In [34]: f = pd.Series([0,np.NaN, 7, -1])
In [35]: f
Out[35]: 
0    0.0
1    NaN
2    7.0
3   -1.0
dtype: float64
# isnull() / notnull() / f[f.isnull()] / f[f.notnull()]
In [36]: f.isnull()
Out[36]: 
0    False
1     True
2    False
3    False
dtype: bool
In [37]: f.notnull()
Out[37]: 
0     True
1    False
2     True
3     True
dtype: bool
In [38]: f[f.isnull()]
Out[38]: 
1   NaN
dtype: float64
In [39]: f[f.notnull()]
Out[39]: 
0    0.0
2    7.0
3   -1.0
dtype: float64
"""

7. Use a dictionary to create a Series object

"""
In [42]: d = {'one':1, 'two':2, 'three':3, 'four':4}
In [43]: myseries = pd.Series(d)
In [44]: nums = ['one', 'two', 'three', 'four', 'five']
# An index can be specified separately; where a value is missing, pandas fills in NaN
In [45]: myseries = pd.Series(d, index = nums)
In [46]: myseries
Out[46]: 
one      1.0
two      2.0
three    3.0
four     4.0
five     NaN
dtype: float64
"""

8. idxmin/idxmax (returns the index of the minimum and maximum values)

"""
In [52]:pds = pd.Series([0,1,2,3,3,4], index=['one', 'two', 'three', 'four', 'five', 'six'])
In [52]: pds
Out[52]: 
one      0
two      1
three    2
four     3
five     3
six      4
dtype: int64
In [53]: pds.idxmin()
Out[53]: 'one'
In [54]: pds.idxmax()
Out[54]: 'six'
"""

9. is_unique

"""
# Check whether the index of the data structure contains duplicate entries
In [60]: pds = pd.Series(range(6), index=['one', 'two', 'three','three', 'four', 'five'])
In [61]: pds.index.is_unique
Out[61]: False
"""

10. Change the index

"""
In [79]: ser = pd.Series([2, 5, 4, 7])
In [80]: ser
Out[80]: 
0    2
1    5
2    4
3    7
dtype: int64
In [81]: ser = pd.Series([2, 5, 4, 7], index = ['one', 'two', 'three', 'four'])
In [82]: ser
Out[82]: 
one      2
two      5
three    4
four     7
dtype: int64
In [83]: ser.reindex(['three', 'four', 'five', 'one'])
Out[83]: 
three    4.0
four     7.0
five     NaN
one      2.0
dtype: float64
In [88]: ser1 = pd.Series([2, 10, 6,7], index = [0, 3, 5, 6])
In [89]: ser1
Out[89]: 
0     2
3    10
5     6
6     7
dtype: int64
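# The new Series object adds the index entries missing from the original Series. With ffill, each newly inserted index entry takes the value of the nearest preceding entry (the one with the smaller index number).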
In [90]: ser1.reindex(range(6),  method='ffill')
Out[90]: 
0     2
1     2
2     2
3    10
4    10
5     6
dtype: int64
# To fill with the value of the entry that follows the newly inserted index instead, use the bfill method
In [91]: ser1.reindex(range(6),  method='bfill')
Out[91]: 
0     2
1    10
2    10
3    10
4     6
5     6
dtype: int64
"""

Guess you like

Origin http://43.154.161.224:23101/article/api/json?id=325579699&siteId=291194637