林子雨编著《数据采集与预处理》教材配套代码（教材官网）
查看所有章节代码

第8章使用pandas进行数据清洗

>>> import numpy as np
>>> a = [1,2,3,4,5]     # 创建简单的列表
>>> b = np.array(a)    # 将列表转换为数组
>>> b
array([1, 2, 3, 4, 5])
>>> b.size  # 数组的元素个数
5
>>> b.shape  # 数组的形状
(5,)
>>> b.ndim  # 数组的维度
1
>>> b.dtype  # 数据的元素类型
dtype('int32')
>>> print(b[0],b[1],b[2])  #访问数组元素
1 2 3
>>> b[4] = 6  # 修改数组元素
>>> b
array([1, 2, 3, 4, 6])
>>> c = np.array([[1,2,3],[4,5,6]])  # 创建二维数组
>>> c.shape     # 数组的形状
(2, 3)
>>> print(c[0,0],c[0,1],c[0,2],c[1,0],c[1,1],c[1,2])
1 2 3 4 5 6

>>> a = np.zeros([2,3],int)
>>> a
array([[0, 0, 0],
       [0, 0, 0]])
>>> a = np.ones([2,3],int)
>>> a
array([[1, 1, 1],
       [1, 1, 1]])
>>> a = np.empty([2,3],int)
>>> a
array([[0, 0, 0],
       [0, 0, 0]])
>>> a = np.eye(3,k=1,dtype=int)
>>> a
array([[0, 1, 0],
       [0, 0, 1],
       [0, 0, 0]])
>>> a = np.eye(4,k=-2,dtype=int)
>>> a
array([[0, 0, 0, 0],
       [0, 0, 0, 0],
       [1, 0, 0, 0],
       [0, 1, 0, 0]])
>>> a = np.full((2,3),4)
>>> a
array([[4, 4, 4],
       [4, 4, 4]])
>>> a = np.random.random((2,3))
>>> a
array([[0.46657535, 0.2398773 , 0.18675721],
       [0.30525201, 0.66826887, 0.5708038 ]])
>>> a = np.random.randint(2, size=10)
>>> a
array([0, 1, 0, 0, 1, 1, 0, 0, 1, 1])
>>> b = np.random.randint(5, size=(2, 4))
>>> b
array([[1, 2, 3, 3],
       [0, 0, 2, 4]])
>>> a = np.random.rand(4,2)
>>> a
array([[0.22225254, 0.25555882],
       [0.69250455, 0.62957494],
       [0.567664  , 0.30459249],
       [0.16394031, 0.00900947]])
>>> a = np.random.randn(2,4)
>>> a
array([[-0.28183753, -0.4931384 , -2.11355842,  0.17782074],
       [-1.14089585,  0.816798  ,  0.39287532, -0.19339946]])
>>> a = np.arange(10)
>>> a
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
>>> a[5]
5
>>> a[5:8]
array([5, 6, 7])
>>> a[5:8] = 12
>>> a
array([ 0,  1,  2,  3,  4, 12, 12, 12,  8,  9])
>>> a = np.arange(10)
>>> a_slice = a[5:8]
>>> a_slice[0] = -1
>>> a_slice
array([-1,  6,  7])
>>> a
array([ 0,  1,  2,  3,  4, -1,  6,  7,  8,  9])
>>> b = np.array([[1,2,3],[4,5,6],[7,8,9]])
>>> b[2]
array([7, 8, 9])
>>> b[0][2]
3
>>> b[0,2]
3
>>> b[:2]
array([[1, 2, 3],
       [4, 5, 6]])
>>> b[:2,1:]
array([[2, 3],
       [5, 6]])
>>> b[1,:2]
array([4, 5])
>>> b[:2,2]
array([3, 6])
>>> b[:,:1]
array([[1],
       [4],
       [7]])

>>> a = np.array([[1,2,3],[4,5,6]])
>>> a*a
array([[ 1,  4,  9],
       [16, 25, 36]])
>>> a-a
array([[0, 0, 0],
       [0, 0, 0]])
>>> 1/a
array([[1.        , 0.5       , 0.33333333],
       [0.25      , 0.2       , 0.16666667]])
>>> a+a
array([[ 2,  4,  6],
           [ 8, 10, 12]])
>>> np.exp(a)  # e的幂次方
array([[  2.71828183,   7.3890561 ,  20.08553692],
       [ 54.59815003, 148.4131591 , 403.42879349]])
>>> np.sqrt(a)
array([[1.        , 1.41421356, 1.73205081],
       [2.        , 2.23606798, 2.44948974]])
>>> a**2
array([[ 1,  4,  9],
       [16, 25, 36]], dtype=int32)

> pip install pandas

>>> import numpy as np
>>> import pandas as pd
>>> from pandas import Series,DataFrame
>>> obj=Series([3,5,6,8,9,2])
>>> obj
0    3
1    5
2    6
3    8
4    9
5    2
dtype: int64
>>> obj.index
RangeIndex(start=0, stop=6, step=1)
>>> obj2=Series([3,5,6,8,9,2],index=['a','b','c','d','e','f'])
>>> obj2
a    3
b    5
c    6
d    8
e    9
f    2
dtype: int64
>>> obj2.index
Index(['a', 'b', 'c', 'd', 'e', 'f'], dtype='object')
>>> obj2['a']
3
>>> obj2[['b','d','f']]
b    5
d    8
f    2
dtype: int64
>>> obj2[obj2>5]
c    6
d    8
e    9
dtype: int64
>>> obj2*2    #乘以2
a     6
b    10
c    12
d    16
e    18
f     4
dtype: int64
>>> np.exp(obj2)    #求e的幂次方
a      20.085537
b     148.413159
c     403.428793
d    2980.957987
e    8103.083928
f       7.389056
dtype: float64
>>> 'b' in obj2
True
>>> 'm' in obj2
False
>>> dic={'m':4,'n':5,'p':6}
>>> obj3=Series(dic)
>>> obj3
m    4
n    5
p    6
dtype: int64
>>> ind=['m','n','p','a']
>>> obj4=Series(dic,index=ind)
>>> obj4
m    4.0
n    5.0
p    6.0
a    NaN
dtype: float64
>>> pd.isnull(obj4)
m    False
n    False
p    False
a     True
dtype: bool
>>> pd.notnull(obj4)
m     True
n     True
p     True
a    False
dtype: bool
>>> obj3+obj4
a     NaN
m     8.0
n    10.0
p    12.0
dtype: float64
>>> obj4.name='sereis_a'
>>> obj4.index.name='letter'
>>> obj4
letter
m    4.0
n    5.0
p    6.0
a    NaN
Name: sereis_a, dtype: float64
>>> obj4.index=['u','v','w','a']
>>> obj4
u    4.0
v    5.0
w    6.0
a    NaN
Name: sereis_a, dtype: float64

>>> import numpy as np
>>> import pandas as pd
>>> from pandas import Series,DataFrame
>>> data = {'sno':['95001', '95002', '95003', '95004'],
    'name':['Xiaoming','Zhangsan','Lisi','Wangwu'],
    'sex':['M','F','F','M'],
    'age':[22,25,24,23]}
>>> frame=DataFrame(data)
>>> frame
     sno     name  sex  age
0  95001  Xiaoming   M   22
1  95002  Zhangsan   F   25
2  95003      Lisi   F   24
3  95004    Wangwu   M   23

>>> frame=DataFrame(data,columns=['name','sno','sex','age'])
>>> frame
       name    sno  sex  age
0  Xiaoming  95001   M   22
1  Zhangsan  95002   F   25
2      Lisi  95003   F   24
3    Wangwu  95004   M   23

>>> frame=DataFrame(data,columns=['sno','name','sex','age','grade'])
>>> frame
     sno      name sex  age grade
0  95001  Xiaoming   M   22   NaN
1  95002  Zhangsan   F   25   NaN
2  95003      Lisi   F   24   NaN
3  95004    Wangwu   M   23   NaN

>>> frame=DataFrame(data,columns=['sno','name','sex','age','grade'],index=['a','b','c','d'])
>>> frame
     sno      name sex  age grade
a  95001  Xiaoming   M   22   NaN
b  95002  Zhangsan   F   25   NaN
c  95003      Lisi   F   24   NaN
d  95004    Wangwu   M   23   NaN

>>> frame['sno']
a    95001
b    95002
c    95003
d    95004
Name: sno, dtype: object
>>> frame.name
a    Xiaoming
b    Zhangsan
c        Lisi
d      Wangwu
Name: name, dtype: object

>>> frame.loc['b']
sno         95002
name     Zhangsan
sex             F
age            25
grade         NaN
Name: b, dtype: object
>>> frame.iloc[1]
sno         95002
name     Zhangsan
sex             F
age            25
grade         NaN
Name: b, dtype: object

>>> frame.loc['b':'c']
     sno      name sex  age grade
b  95002  Zhangsan   F   25   NaN
c  95003      Lisi   F   24   NaN
>>> frame.iloc[2:4]
     sno    name sex  age grade
c  95003    Lisi   F   24   NaN
d  95004  Wangwu   M   23   NaN

>>> frame.loc[:,['sex']]
  sex
a   M
b   F
c   F
d   M

>>> frame.loc[:,'sex':]
  sex  age grade
a   M   22   NaN
b   F   25   NaN
c   F   24   NaN
d   M   23   NaN
>>> frame.iloc[:,1:4]
       name sex  age
a  Xiaoming   M   22
b  Zhangsan   F   25
c      Lisi   F   24
d    Wangwu   M   23
>>> frame['grade']=[93,89,72,84]
>>> frame
     sno      name sex  age  grade
a  95001  Xiaoming   M   22     93
b  95002  Zhangsan   F   25     89
c  95003      Lisi   F   24     72
d  95004    Wangwu   M   23     84
>>> frame['grade']=Series([67,89],index=['a','c'])
>>> frame
     sno      name sex  age  grade
a  95001  Xiaoming   M   22   67.0
b  95002  Zhangsan   F   25    NaN
c  95003      Lisi   F   24   89.0
d  95004    Wangwu   M   23    NaN
>>> frame['province']=['ZheJiang','FuJian','Beijing','ShangHai']
>>> frame
     sno      name sex  age  grade  province
a  95001  Xiaoming   M   22   67.0  ZheJiang
b  95002  Zhangsan   F   25    NaN    FuJian
c  95003      Lisi   F   24   89.0   Beijing
d  95004    Wangwu   M   23    NaN  ShangHai
>>> del frame['province']
>>> frame
     sno      name sex  age  grade
a  95001  Xiaoming   M   22   67.0
b  95002  Zhangsan   F   25    NaN
c  95003      Lisi   F   24   89.0
d  95004    Wangwu   M   23    NaN
>>> dic={'computer':{2020:78,2021:82},'math':{2019:76,2020:78,2021:81}}
>>> frame1=DataFrame(dic)
>>> frame1
      computer  math
2020      78.0    78
2021      82.0    81
2019       NaN    76
>>> frame1.T
          2020  2021  2019
computer  78.0  82.0   NaN
math      78.0  81.0  76.0
>>> frame2=DataFrame(dic,index=[2020,2021,2022])
>>> frame2
      computer  math
2020      78.0  78.0
2021      82.0  81.0
2022       NaN   NaN

>>> import numpy as np
>>> import pandas as pd
>>> #用顺序数np.arange(12).reshape(3,4)
>>> df1=pd.DataFrame(np.arange(12).reshape(3,4),columns=['a','b','c','d'])
>>> df1
   a  b   c   d
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11
>>> #用随机数np.random.randint(20,size=(2,3))
>>> df2=pd.DataFrame(np.random.randint(20,size=(2,3)),columns=['b','d','a'])
>>> df2
    b   d  a
0   0  19  4
1  10   2  5
>>> #用随机数np.random.randn(5,3)
>>> df3=pd.DataFrame(np.random.randn(5,3),index=list('abcde'),columns=['one','two','three'])
>>> df3
        one       two     three
a -0.204225 -0.402101 -0.528857
b  0.070463 -1.203973 -1.271088
c -1.210856  0.438507  1.442583
d -0.101521  1.283724 -0.101034
e -1.256007 -0.112633 -1.590732
>>> import pandas as pd
>>> from pandas import DataFrame
>>> csv_df = pd.read_csv(‘C:\\Python38\my_file.csv’)
>>> table_df = pd.read_table(‘C:\\Python38\my_table.txt’)

>>> import numpy as np
>>> import pandas as pd
>>> from pandas import Series,DataFrame,Index
>>> # 获取Index对象
>>> x = Series(range(3), index = ['a', 'b', 'c'])
>>> index = x.index
>>> index
Index(['a', 'b', 'c'], dtype='object')
>>> index[0:2]
Index(['a', 'b'], dtype='object')
>>> # 构造/使用Index对象
>>> index = Index(np.arange(3))
>>> obj2 = Series([2.5, -3.5, 0], index = index)
>>> obj2
0    2.5
1   -3.5
2    0.0
dtype: float64
>>> obj2.index is index
True
>>> # 判断列/行索引是否存在
>>> data = {'pop':[2.3,2.6],
    'year':[2020,2021]}
>>> frame = DataFrame(data)
>>> frame
   pop  year
0  2.3  2020
1  2.6  2021
>>> 'pop' in frame.columns
True
>>> 1 in frame.index
True

>>> import numpy as np
>>> import pandas as pd
>>> from pandas import Series,DataFrame
    >>> s1 = Series([1, 2, 3, 4], index=['A', 'B', 'C', 'D'])
>>> s1
A    1
B    2
C    3
D    4
dtype: int64
>>> # 重新指定index， 多出来的index，可以使用fill_value填充
>>> s1.reindex(index=['A', 'B', 'C', 'D', 'E'], fill_value = 10)
A     1
B     2
C     3
D     4
E    10
dtype: int64
>>> s2 = Series(['A', 'B', 'C'], index = [1, 5, 10])
>>> # 修改索引，将s2的索引增加到15个，如果新增加的索引值不存在，默认为NaN
>>> s2.reindex(index=range(15))
0     NaN
1       A
2     NaN
3     NaN
4     NaN
5       B
6     NaN
7     NaN
8     NaN
9     NaN
10      C
11    NaN
12    NaN
13    NaN
14    NaN
dtype: object
>>> # ffill： 表示forward fill，向前填充
>>> # 如果新增加索引的值不存在，那么按照前一个非NaN的值填充进去
>>> s2.reindex(index=range(15), method='ffill')
0     NaN
1       A
2       A
3       A
4       A
5       B
6       B
7       B
8       B
9       B
10      C
11      C
12      C
13      C
14      C
dtype: object
    >>> # 减少index
>>> s1.reindex(['A', 'B'])
A    1
B    2
dtype: int64

>>> df1 = DataFrame(np.random.rand(25).reshape([5, 5]), index=['A', 'B', 'D', 'E', 'F'], columns=['c1', 'c2', 'c3', 'c4', 'c5'])
>>> df1
         c1        c2        c3        c4        c5
A  0.077539  0.574105  0.868985  0.305669  0.738754
B  0.939470  0.464108  0.951791  0.277599  0.091289
D  0.019077  0.850392  0.069981  0.397684  0.526270
E  0.564420  0.723089  0.971805  0.501211  0.641450
F  0.308109  0.831558  0.215271  0.729247  0.944689
>>> # 为DataFrame添加一个新的索引
>>> # 可以看到自动扩充为NaN
>>> df1.reindex(index=['A', 'B', 'C', 'D', 'E', 'F'])
         c1        c2        c3        c4        c5
A  0.077539  0.574105  0.868985  0.305669  0.738754
B  0.939470  0.464108  0.951791  0.277599  0.091289
C       NaN       NaN       NaN       NaN       NaN
D  0.019077  0.850392  0.069981  0.397684  0.526270
E  0.564420  0.723089  0.971805  0.501211  0.641450
F  0.308109  0.831558  0.215271  0.729247  0.944689
>>> # 扩充列
>>> df1.reindex(columns=['c1', 'c2', 'c3', 'c4', 'c5', 'c6'])
         c1        c2        c3        c4        c5  c6
A  0.077539  0.574105  0.868985  0.305669  0.738754 NaN
B  0.939470  0.464108  0.951791  0.277599  0.091289 NaN
D  0.019077  0.850392  0.069981  0.397684  0.526270 NaN
E  0.564420  0.723089  0.971805  0.501211  0.641450 NaN
F  0.308109  0.831558  0.215271  0.729247  0.944689 NaN
>>> # 减少index
>>> df1.reindex(index=['A', 'B'])
         c1        c2        c3        c4        c5
A  0.077539  0.574105  0.868985  0.305669  0.738754
B  0.939470  0.464108  0.951791  0.277599  0.091289

>>> import numpy as np
>>> import pandas as pd
>>> from pandas import Series,DataFrame
>>> # Series根据行索引删除行
>>> s1 = Series(np.arange(4), index = ['a', 'b', 'c','d'])
>>> s1
a    0
b    1
c    2
d    3
dtype: int32
>>> s1.drop(['a', 'b'])
c    2
d    3
dtype: int32
>>> # DataFrame根据索引行/列删除行/列
>>> df1 = DataFrame(np.arange(16).reshape((4, 4)),
          index = ['a', 'b', 'c', 'd'],
          columns = ['A', 'B', 'C', 'D'])
>>> df1
    A   B   C   D
a   0   1   2   3
b   4   5   6   7
c   8   9  10  11
d  12  13  14  15
>>> df1.drop(['A','B'],axis=1)   #在列的维度上删除AB两行，axis值为1表示列的维度
    C   D
a   2   3
b   6   7
c  10  11
d  14  15
>>> df1.drop('a', axis = 0)   #在行的维度上删除行，axis值为0表示行的维度
    A   B   C   D
b   4   5   6   7
c   8   9  10  11
d  12  13  14  15
>>> df1.drop(['a', 'b'], axis = 0)
    A   B   C   D
c   8   9  10  11
d  12  13  14  15

>>> import numpy as np
>>> import pandas as pd
>>> from pandas import Series,DataFrame
>>> # DataFrame的索引
>>> data = DataFrame(np.arange(16).reshape((4, 4)),
         index = ['a', 'b', 'c', 'd'],
         columns = ['A', 'B', 'C', 'D'])
>>> data
    A   B   C   D
a   0   1   2   3
b   4   5   6   7
c   8   9  10  11
d  12  13  14  15
>>> data['A']  #打印列
a     0
b     4
c     8
d    12
Name: A, dtype: int32
>>> data[['A', 'B']]   #花式索引
    A   B
a   0   1
b   4   5
c   8   9
d  12  13
>>> data[:2]   #切片索引,选择行
   A  B  C  D
a  0  1  2  3
b  4  5  6  7
>>> # 根据条件选择
>>> data
    A   B   C   D
a   0   1   2   3
b   4   5   6   7
c   8   9  10  11
d  12  13  14  15
>>> data[data.A > 5]  #根据条件选择行
    A   B   C   D
c   8   9  10  11
d  12  13  14  15
>>> data < 5    #打印True或者False
       A      B      C      D
a   True   True   True   True
b   True  False  False  False
c  False  False  False  False
d  False  False  False  False
>>> data[data < 5] = 0  #条件索引
>>> data
    A   B   C   D
a   0   0   0   0
b   0   5   6   7
c   8   9  10  11
d  12  13  14  15

>>> import numpy as np
>>> import pandas as pd
>>> from pandas import Series,DataFrame
    >>> df1 = DataFrame(np.arange(12).reshape((3,4)),columns=list("abcd"))
>>> df2 = DataFrame(np.arange(20).reshape((4,5)),columns=list("abcde"))
>>> df1
   a  b   c   d
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11
>>> df2
    a   b   c   d   e
0   0   1   2   3   4
1   5   6   7   8   9
2  10  11  12  13  14
3  15  16  17  18  19
>>> df1+df2
      a     b     c     d   e
0   0.0   2.0   4.0   6.0 NaN
1   9.0  11.0  13.0  15.0 NaN
2  18.0  20.0  22.0  24.0 NaN
3   NaN   NaN   NaN   NaN NaN
>>> df1.add(df2,fill_value=0)  #为df1添加第3行和e这一列
      a     b     c     d     e
0   0.0   2.0   4.0   6.0   4.0
1   9.0  11.0  13.0  15.0   9.0
2  18.0  20.0  22.0  24.0  14.0
3  15.0  16.0  17.0  18.0  19.0
>>> df1.add(df2).fillna(0)  #按照正常方式将df1和df2相加，然后将NaN值填充为0
      a     b     c     d    e
0   0.0   2.0   4.0   6.0  0.0
1   9.0  11.0  13.0  15.0  0.0
2  18.0  20.0  22.0  24.0  0.0
3   0.0   0.0   0.0   0.0  0.0

>>> import numpy as np
>>> import pandas as pd
>>> from pandas import Series,DataFrame
    >>> frame = DataFrame(np.arange(12).reshape((4,3)),columns=list("bde"),
                 index=["Beijing","Shanghai","Shenzhen","Xiamen"])
>>> frame
        b   d   e
Beijing   0   1   2
Shanghai  3   4   5
Shenzhen  6   7   8
Xiamen    9  10  11
>>> frame.iloc[1]  # 获取某一行数据
b    3
d    4
e    5
Name: Shanghai, dtype: int32
>>> frame.index  #获取索引
Index(['Beijing', 'Shanghai', 'Shenzhen', 'Xiamen'], dtype='object')
>>> frame.loc["Xiamen"]    #根据行索引提取数据
b     9
d    10
e    11
Name: Xiamen, dtype: int32
>>> series = frame.iloc[0]
>>> series
b    0
d    1
e    2
Name: Beijing, dtype: int32
>>> frame - series
        b  d  e
Beijing   0  0  0
Shanghai  3  3  3
Shenzhen  6  6  6
Xiamen    9  9  9

>>> import numpy as np
>>> import pandas as pd
>>> from pandas import Series,DataFrame
    >>> frame = DataFrame(np.arange(12).reshape((4,3)),columns=list("bde"),
                 index=["Beijing","Shanghai","Shenzhen","Xiamen"])
>>> frame
        b   d   e
Beijing   0   1   2
Shanghai  3   4   5
Shenzhen  6   7   8
Xiamen    9  10  11
>>> f = lambda x : x.max() - x.min()  # 匿名函数
>>> frame.apply(f)  #apply默认第二个参数axis=0,作用于列方向上，axis=1时作用于行方向上
b    9
d    9
e    9
dtype: int64
>>> frame.apply(f,axis=1)
Beijing     2
Shanghai    2
Shenzhen    2
Xiamen      2
dtype: int64

>>> import numpy as np
>>> import pandas as pd
>>> from pandas import Series,DataFrame
    >>> frame = DataFrame(np.arange(12).reshape((4,3)),columns=list("bde"),
                 index=["Beijing","Shanghai","Shenzhen","Xiamen"])
>>> frame
        b   d   e
Beijing   0   1   2
Shanghai  3   4   5
Shenzhen  6   7   8
Xiamen    9  10  11
>>> f = lambda num : "%.2f"%num  #匿名函数
>>> # 将匿名函数f应用到frame中的每一元素中
>>> strFrame = frame.applymap(f)
>>> strFrame
             b      d      e
Beijing   0.00   1.00   2.00
Shanghai  3.00   4.00   5.00
Shenzhen  6.00   7.00   8.00
Xiamen    9.00  10.00  11.00
>>> frame.dtypes  # 获取DataFrame中每一列的数据类型
b    int32
d    int32
e    int32
dtype: object
>>> strFrame.dtypes
b    object
d    object
e    object
dtype: object
>>> # 将一个规则应用到某一列上
>>> frame["d"].map(lambda x :x+10)
Beijing     11
Shanghai    14
Shenzhen    17
Xiamen      20
Name: d, dtype: int64

>>> import numpy as np
>>> import pandas as pd
>>> from pandas import Series,DataFrame
>>> series = Series(range(4),index=list("dabc"))
>>> series
d    0
a    1
b    2
c    3
dtype: int64
>>> series.sort_index() #索引按字母顺序排序
a    1
b    2
c    3
d    0
dtype: int64
>>> frame = DataFrame(np.arange(8).reshape((2,4)),
          index=["three","one"],
          columns=list("dabc"))
>>> frame
       d  a  b  c
three  0  1  2  3
one    4  5  6  7
>>> frame.sort_index()
       d  a  b  c
one    4  5  6  7
three  0  1  2  3
>>> frame.sort_index(axis=1,ascending=False)
       d  c  b  a
three  0  3  2  1
one    4  7  6  5
>>> # 按照DataFrame中某一列的值进行排序
>>> df = DataFrame({"a":[4,7,-3,2],"b":[0,1,0,1]})
>>> df
   a  b
0  4  0
1  7  1
2 -3  0
3  2  1
>>> # 按照b这一列的数据值进行排序
>>> df.sort_values(by="b")
   a  b
0  4  0
2 -3  0
1  7  1
3  2  1

>>> import pandas as pd
>>> from pandas import Series,DataFrame
>>> obj=Series([7,-4,7,3,2,0,5])
>>> obj.rank()
0    6.5
1    1.0
2    6.5
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

>>> obj.rank(method='first')
0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64
>>> obj.rank(method='min')
0    6.0
1    1.0
2    6.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64
>>> obj.rank(method='max')
0    7.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

>>> import pandas as pd
>>> from pandas import Series,DataFrame
>>> frame=DataFrame({'b':[3,1,5,2],'a':[8,4,3,7],'c':[2,7,9,4]})
>>> frame
   b  a  c
0  3  8  2
1  1  4  7
2  5  3  9
3  2  7  4
>>> frame.rank(axis=1)  # axis=0时作用于列方向上，axis=1时作用于行方向上
     b    a    c
0  2.0  3.0  1.0
1  1.0  2.0  3.0
2  2.0  1.0  3.0
3  1.0  3.0  2.0

>>> import pandas as pd
>>> import numpy as np
>>> from pandas import Series,DataFrame
>>> dict_obj = {'key1' : ['a', 'b', 'a', 'b', 'a', 'b', 'a', 'a'],
        'key2' : ['one', 'one', 'two', 'three','two', 'two', 'one', 'three'],
        'data1': np.random.randn(8),
        'data2': np.random.randn(8)}
>>> df_obj = DataFrame(dict_obj)
>>> df_obj
  key1   key2     data1     data2
0    a    one -0.026042  0.051420
1    b    one -0.214902 -1.245808
2    a    two -0.626813  0.313240
3    b  three -1.074137  0.245969
4    a    two  0.106360 -0.344038
5    b    two -0.719663 -0.877795
6    a    one -0.248008 -0.650183
7    a  three  0.861269  1.388312
>>> df_obj.groupby('key1')
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000000037E93D0>
>>> type(df_obj.groupby('key1'))
<class 'pandas.core.groupby.generic.DataFrameGroupBy'>
>>> df_obj['data1'].groupby(df_obj['key1'])
<pandas.core.groupby.generic.SeriesGroupBy object at 0x000000000B4E7D00>
>>> type(df_obj['data1'].groupby(df_obj['key1']))
<class 'pandas.core.groupby.generic.SeriesGroupBy'>

>>> import pandas as pd
>>> import numpy as np
>>> from pandas import Series,DataFrame
>>> dict_obj = {'key1' : ['a', 'b', 'a', 'b', 'a', 'b', 'a', 'a'],
        'key2' : ['one', 'one', 'two', 'three','two', 'two', 'one', 'three'],
        'data1': np.random.randn(8),
        'data2': np.random.randn(8)}
>>> df_obj = DataFrame(dict_obj)
>>> df_obj
  key1   key2     data1     data2
0    a    one -0.026042  0.051420
1    b    one -0.214902 -1.245808
2    a    two -0.626813  0.313240
3    b  three -1.074137  0.245969
4    a    two  0.106360 -0.344038
5    b    two -0.719663 -0.877795
6    a    one -0.248008 -0.650183
7    a  three  0.861269  1.388312
>>> grouped1 = df_obj.groupby('key1')
>>> grouped1.mean()
         data1     data2
key1                    
a     0.013353  0.151750
b    -0.669567 -0.625878
>>> grouped2 = df_obj['data1'].groupby(df_obj['key1'])
>>>grouped2.mean()
key1
a    0.013353
b   -0.669567
Name: data1, dtype: float64
>>> grouped1.size()  #返回每个分组的元素个数
key1
a    5
b    3
dtype: int64
>>> grouped2.size()  #返回每个分组的元素个数
key1
a    5
b    3
Name: data1, dtype: int64
>>> df_obj.groupby([df_obj['key1'], df_obj['key2']]).size()
key1  key2 
a     one      2
      three    1
      two      2
b     one      1
      three    1
      two      1
dtype: int64
>>> grouped3 = df_obj.groupby(['key1', 'key2'])
>>> grouped3.size()
key1  key2 
a     one      2
      three    1
      two      2
b     one      1
      three    1
      two      1
dtype: int64
>>> grouped3.mean()
               data1     data2
key1 key2
a    one   -0.137025 -0.299382
     three  0.861269  1.388312
     two   -0.260226 -0.015399
b    one   -0.214902 -1.245808
     three -1.074137  0.245969
     two   -0.719663 -0.877795

>>> import pandas as pd
>>> import numpy as np
>>> from pandas import Series,DataFrame
>>> dict_obj = {'key1' : ['a', 'b', 'a', 'b', 'a', 'b', 'a', 'a'],
        'key2' : ['one', 'one', 'two', 'three','two', 'two', 'one', 'three'],
        'data1': np.random.randn(8),
        'data2': np.random.randn(8)}
>>> df_obj = DataFrame(dict_obj)
>>> df_obj
  key1   key2     data1     data2
0    a    one -0.026042  0.051420
1    b    one -0.214902 -1.245808
2    a    two -0.626813  0.313240
3    b  three -1.074137  0.245969
4    a    two  0.106360 -0.344038
5    b    two -0.719663 -0.877795
6    a    one -0.248008 -0.650183
7    a  three  0.861269  1.388312
>>> self_def_key = [0, 1, 2, 3, 3, 4, 5, 7]
>>> df_obj.groupby(self_def_key).size()
0    1
1    1
2    1
3    2
4    1
5    1
7    1

>>> import pandas as pd
>>> from pandas import DataFrame
>>> frame=DataFrame({'b':[3,1,5,2],'a':[8,4,3,7],'c':[2,7,9,4]})
>>> frame
   b  a  c
0  3  8  2
1  1  4  7
2  5  3  9
3  2  7  4
>>> frame.shape
(4, 3)
>>> frame.shape[0]
4
>>> frame.shape[1]
3

>>> import pandas as pd
>>> import numpy as np
>>> from pandas import DataFrame
>>> df= DataFrame({'id':[1,np.nan,3,4],'name':['asx',np.nan,'wes','asd'],'score':[78,90,np.nan,88]},index=list('abcd'))
>>> df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, a to d
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      3 non-null      float64
 1   name    3 non-null      object 
 2   score   3 non-null      float64
dtypes: float64(2), object(1)
memory usage: 128.0+ bytes

>>> import pandas as pd
>>> import numpy as np
>>> from pandas import DataFrame
>>> info_nums = DataFrame({'num': np.random.randint(1, 50, 11)})
>>> info_nums
    num
0    43
1     7
2    13
3    47
4    23
5    10
6    44
7     2
8    31
9    21
10   47
>>> info_nums['num_bins'] = pd.cut(x=info_nums['num'], bins=[1, 25, 50])
>>> info_nums
    num  num_bins
0    43  (25, 50]
1     7   (1, 25]
2    13   (1, 25]
3    47  (25, 50]
4    23   (1, 25]
5    10   (1, 25]
6    44  (25, 50]
7     2   (1, 25]
8    31  (25, 50]
9    21   (1, 25]
10   47  (25, 50]
>>> info_nums['num_bins'].unique()
[(25, 50], (1, 25]]
Categories (2, interval[int64]): [(1, 25] < (25, 50]]

>>> import pandas as pd
>>> import numpy as np
>>> from pandas import DataFrame
>>> info_nums = DataFrame({'num': np.random.randint(1, 10, 7)})
>>> info_nums
   num
0    4
1    5
2    6
3    8
4    6
5    2
6    3
>>> info_nums['nums_labels'] = pd.cut(x=info_nums['num'], bins=[1, 7, 10], labels=['Lows', 'Highs'], right=False)
>>> info_nums
   num nums_labels
0    4        Lows
1    5        Lows
2    6        Lows
3    8       Highs
4    6        Lows
5    2        Lows
6    3        Lows
>>> info_nums['nums_labels'].unique()
['Lows', 'Highs']
Categories (2, object): ['Lows' < 'Highs']

>>> import numpy as np
>>> import pandas as pd
>>> from pandas import Series,DataFrame
>>> df=DataFrame([[1.3,np.nan],[6.2,-3.4],[np.nan,np.nan],[0.65,-1.4]],columns=['one','two'])
>>> df.sum()  #计算每列的和，默认排除NaN
one    8.15
two   -4.80
dtype: float64
>>> df.sum(axis=1)  #计算每行的和，默认排除NaN
0    1.30
1    2.80
2    0.00
3   -0.75
dtype: float64
>>> #计算每行的和，设置skipna=False，NaN参与计算，结果仍为NaN
>>> df.sum(axis=1,skipna=False)
0     NaN
1    2.80
2     NaN
3   -0.75
dtype: float64
>>> df.mean(axis=1)
0    1.300
1    1.400
2      NaN
3   -0.375
dtype: float64
>>> df.mean(axis=1,skipna=False)  #计算每行的平均值，NaN参与运算
0      NaN
1    1.400
2      NaN
3   -0.375
dtype: float64
>>> df.cumsum()   #求样本值的累计和
    one  two
0  1.30  NaN
1  7.50 -3.4
2   NaN  NaN
3  8.15 -4.8
>>> df.describe()    #针对列计算汇总统计
            one       two
count  3.000000  2.000000
mean   2.716667 -2.400000
std    3.034112  1.414214
min    0.650000 -3.400000
25%    0.975000 -2.900000
50%    1.300000 -2.400000
75%    3.750000 -1.900000
max    6.200000 -1.400000

>>> import pandas as pd
>>> from pandas import Series
    >>> s = Series([3,3,1,2,4,3,4,6,5,6])
>>> #判断Series中的值是否重复,False表示重复
>>> print(s.is_unique)
False
>>> #输出Series中不重复的值,返回值没有排序，返回值的类型为数组
>>> s.unique()
array([3, 1, 2, 4, 6, 5], dtype=int64)
>>> #统计Series中重复值出现的次数,默认是按出现次数降序排序
>>> s.value_counts()
3    3
4    2
6    2
1    1
2    1
5    1
dtype: int64
>>> #按照重复值的大小排序输出频率
>>> s.value_counts(sort=False)
1    1
2    1
3    3
4    2
5    1
6    2
dtype: int64

>>> import pandas as pd
>>> from pandas import Series,DataFrame
    >>> s = Series([6,6,7,2,2])
>>> s
0    6
1    6
2    7
3    2
4    2
dtype: int64
>>> #判断矢量化集合的成员资格,返回一个布尔类型的Series
>>> s.isin([6])
0     True
1     True
2    False
3    False
4    False
dtype: bool
>>> type(s.isin([6]))
<class 'pandas.core.series.Series'>
>>> #通过成员资格方法选取Series中的数据子集
>>> s[s.isin([6])]
0    6
1    6
dtype: int64
>>> data = [[4,3,7],[3,2,5],[7,3,6]]
>>> df = DataFrame(data,index=["a","b","c"],columns=["one","two","three"])
>>> df
   one  two  three
a    4    3      7
b    3    2      5
c    7    3      6
>>> #返回一个布尔型的DataFrame
>>> df.isin([2])
     one    two  three
a  False  False  False
b  False   True  False
c  False  False  False
>>> #选取DataFrame中值为2的数，其他的为NaN
>>> df[df.isin([2])]
   one  two  three
a  NaN  NaN    NaN
b  NaN  2.0    NaN
c  NaN  NaN    NaN
>>> #选取DataFrame中值为2的数，将NaN用0进行填充
>>> df[df.isin([2])].fillna(0)
   one  two  three
a  0.0  0.0    0.0
b  0.0  2.0    0.0
c  0.0  0.0    0.0

>>> import pandas as pd
>>> import numpy as np
>>> from pandas import Series,DataFrame
>>> df = DataFrame(np.random.randn(5, 3), index=['a', 'c', 'e', 'f','h'],columns=['one', 'two', 'three'])
>>> df = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])
>>> df['one'].isnull()
a    False
b     True
c    False
d     True
e    False
f    False
g     True
h    False
Name: one, dtype: bool
>>> df['one'].notnull()
a     True
b    False
c     True
d    False
e     True
f     True
g    False
h     True
Name: one, dtype: bool

>>> import pandas as pd
>>> import numpy as np
>>> from pandas import Series,DataFrame
>>> df = pd.DataFrame(np.random.randn(3, 3), index=['a', 'c', 'e'],columns=['one','two', 'three'])
>>> df = df.reindex(['a', 'b', 'c'])
>>> df
        one       two     three
a -0.963024 -0.284216 -1.762598
b       NaN       NaN       NaN
c  0.677290  0.320812 -0.145247
>>> df.fillna(0)  #用0填充缺失值
        one       two     three
a -0.963024 -0.284216 -1.762598
b  0.000000  0.000000  0.000000
c  0.677290  0.320812 -0.145247
>>> df.fillna(method='pad')  #填充时和前一行的数据相同
        one       two     three
a -0.963024 -0.284216 -1.762598
b -0.963024 -0.284216 -1.762598
c  0.677290  0.320812 -0.145247
>>> df.fillna(method='backfill')  #填充时和后一行的数据相同
        one       two     three
a -0.963024 -0.284216 -1.762598
b  0.677290  0.320812 -0.145247
c  0.677290  0.320812 -0.145247

>>> import pandas as pd
>>> import numpy as np
>>> from pandas import Series,DataFrame
>>> df = DataFrame(np.random.randn(5, 3), index=['a', 'c', 'e', 'f','h'],columns=['one', 'two', 'three'])
>>> df = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])
>>> df
        one       two     three
a -0.249220 -0.003033 -0.615404
b       NaN       NaN       NaN
c  0.034787 -0.056103 -0.389375
d       NaN       NaN       NaN
e -0.453844  1.131537  0.273852
f -0.895511 -0.306457 -0.135208
g       NaN       NaN       NaN
h  0.701194  0.556521 -0.341591
>>> df.dropna()  #默认情况下，axis = 0，即在行上应用
        one       two     three
a -0.249220 -0.003033 -0.615404
c  0.034787 -0.056103 -0.389375
e -0.453844  1.131537  0.273852
f -0.895511 -0.306457 -0.135208
h  0.701194  0.556521 -0.341591
>>> df.dropna(axis=1)  # axis = 1时在列上应用
Empty DataFrame
Columns: []
Index: [a, b, c, d, e, f, g, h]
>>> # 可以用一些具体的值取代一个通用的值
>>> df = DataFrame({'one':[1,2,3,4,5,300],'two':[200,0,3,4,5,6]})
>>> df
   one  two
0    1  200
1    2    0
2    3    3
3    4    4
4    5    5
5  300    6
>>> df.replace({200:10,300:60})
   one  two
0    1   10
1    2    0
2    3    3
3    4    4
4    5    5
5   60    6

> pip install matplotlib

>>> import matplotlib.pyplot as plt
>>> plt.plot([1,2,3],[4,8,5])
>>> plt.show()

>>> x = [1,2,3]    #第1条折线的横坐标
>>> y = [4,8,5]    #第1条折线的纵坐标
>>> x2 = [1,2,3]       #第2条折线的横坐标
>>> y2 = [11,15,13]    #第2条折线的纵坐标
>>> plt.plot(x, y, label='First Line')    #绘制第1条折线，给折线一个名称'First Line'
>>> plt.plot(x2, y2, label='Second Line')  #绘制第2条折线，给折线一个名称'Second Line'
>>> plt.xlabel('Plot Number')    #给横坐标轴添加名称
>>> plt.ylabel('Important var')    #给纵坐标轴添加名称
>>> plt.title('Graph Example\nTwo lines')  #添加标题
>>> plt.legend()   #添加图例
>>> plt.show()   #显示到屏幕上（如图8-2所示）

>>> plt.bar([1,3,5,7,9],[6,3,8,9,2], label="First Bar")   #第1个数据系列
>>> #下面的color='g'，表示设置颜色为绿色
>>> plt.bar([2,4,6,8,10],[9,7,3,6,7], label="Second Bar", color='g')  #第2个数据系列
>>> plt.legend()  #添加图例
>>> plt.xlabel('bar number')   #给横坐标轴添加名称
>>> plt.ylabel('bar height')   #给纵坐标轴添加名称
>>> plt.title('Bar Example\nTwo bars!')  #添加标题
>>> plt.show()  #显示到屏幕上（如图8-3所示）

>>> population_ages = [21,57,61,47,25,21,33,41,41,5,96,103,108,
           121,122,123,131,112,114,113,82,77,67,56,46,44,45,47]
>>> bins = [0,10,20,30,40,50,60,70,80,90,100,110,120,130]
>>> plt.hist(population_ages, bins, histtype='bar', rwidth=0.8)
>>> plt.xlabel('x')
>>> plt.ylabel('y')
>>> plt.title('Graph Example\n Histogram')
>>> plt.show()  #显示到屏幕上（如图8-4所示）

>>> slices = [7,2,2,13]  #即activities分别占比7/24,2/,2/24,13/24
>>> activities = ['sleeping','eating','working','playing']
>>> cols = ['c','m','r','b']
>>> plt.pie(slices,
          labels=activities,
colors=cols,
startangle=90,
          shadow= True,
          explode=(0,0.1,0,0),
          autopct='%1.1f%%')
>>> plt.title('Graph Example\n Pie chart')
>>> plt.show()    #显示到屏幕上（如图8-5所示）

>>> import pandas
>>> food_info = pandas.read_csv("D:\\food_info.csv")
>>> #使用head()方法读取前几行数据，参数为空时默认展示5行数据，可以传入其他数字，如4、9等；
>>> food_info.head()
>>> #使用tail()方法倒着从后读取后几行数据，参数为空时默认展示5行数据，可以传入其他数字，如4、9等；
>>> food_info.tail()
>>> #使用columns方法，打印列名，作用是可以看到每一列的数据代表的含义
>>> food_info.columns
>>> #使用shape方法，打印数据的维度，一共有几行几列
>>> food_info.shape
>>> #也可以使用切片操作，例如取第3-5行的数据
>>> food_info.loc[3:5]
>>> #也可以传进去一个列表，例如打印 4，6，9 行的数据
>>> food_info.loc[[4,6,9]]
>>> #通过列名取一列数据
>>> food_info['Water_(g)']
>>> #通过好几个列名取好几列数据，参数是一个含有多个列名的列表
>>> food_info[['Water_(g)','Ash_(g)']]
>>> #找出以“(g)”结尾的列，取前3行数据打印出来
>>> col_names = food_info.columns.tolist()
>>> gram_columns = []
>>> for i in col_names:
    if i.endswith("(g)"):
        gram_columns.append(i)
>>> gram_df = food_info[gram_columns]
>>> print(gram_df.head(3))

>>> iron_gram = food_info["Iron_(mg)"]/1000
>>> food_info["Iron_(g)"] = iron_gram
>>> food_info.shape
>>> #对某一列进行归一化操作，比如列中的每个元素都除以该列的最大值
>>> normalized_fat = food_info["Lipid_Tot_(g)"]/ food_info["Lipid_Tot_(g)"].max()
>>> print(normalized_fat)
>>> food_info.sort_values("Sodium_(mg)",inplace = True, ascending = False)
>>> print(food_info['Sodium_(mg)'])

> pip install xlrd
> pip install openpyxl

>>> import numpy as np
>>> import pandas as pd
>>> index=pd.read_excel('D:\\baidu\\baidu_index.xls')
>>> # 处理缺失值
>>> index = index.fillna(0)
>>> index['date'].head()
0   2018-12-01
1   2018-12-02
2   2018-12-03
3   2018-12-04
4   2018-12-05
Name: date, dtype: datetime64[ns]
>>> index['date']
0      2018-12-01
1      2018-12-02
2      2018-12-03
3      2018-12-04
4      2018-12-05
          ...
Name: date, Length: 6344, dtype: datetime64[ns]
>>> index['date'] = index['date'].dt.strftime('%B')
>>> index['date']
0       December
1       December
2       December
3       December
4       December
          ...
Name: date, Length: 6344, dtype: object
>>> index['keyword']
         ...   
6339    T-cross
6340    T-cross
6341    T-cross
6342    T-cross
6343    T-cross
Name: keyword, Length: 6344, dtype: object
>>> index['keyword'] = index['keyword'].apply(lambda x: x.strip(' \r\n\t').upper())
>>> index['keyword']
         ...   
6339    T-CROSS
6340    T-CROSS
6341    T-CROSS
6342    T-CROSS
6343    T-CROSS
Name: keyword, Length: 6344, dtype: object
>>> new_index_mean = index.groupby(['keyword','date'])['index'].sum()
>>> new_index_mean
keyword   date    
IX25      April       29144.0
          December    32422.0
          February    28511.0
          January     32204.0
          June          882.0
                       ...   
雪铁龙C3-XR  June          184.0
          March        9967.0
          May          6419.0
          November     6346.0
          October      7757.0
Name: index, Length: 234, dtype: float64

>>> import matplotlib.pyplot as plt
>>> import numpy as np
>>> import pandas as pd
>>> # 读取数据
>>> movie = pd.read_csv("D:\\IMDB-Movie-Data.csv")
>>> #查看前5条数据
>>> movie.head()
>>> #求出电影评分的平均分
>>> movie['Rating'].mean()
>>> np.unique(movie['Director']).shape[0]
>>> # 创建画布
>>> plt.figure(figsize=(20, 8), dpi=100)
>>> # 绘制图像
>>> plt.hist(movie["Rating"].values, bins=20)
>>> # 添加刻度
>>> max_ = movie["Rating"].max()
>>> min_ = movie["Rating"].min()
>>> t1 = np.linspace(min_, max_, num=21)
>>> plt.xticks(t1)
>>> # 添加网格
>>> plt.grid()
>>> # 显示
>>> plt.show()   #显示到屏幕上（如图8-7所示）
>>> # 查看电影时长
>>> runtime_data = movie["Runtime (Minutes)"]
>>> # 创建画布
>>> plt.figure(figsize=(20,8),dpi=80)
>>> # 求出最大值和最小值
>>> max_ = runtime_data.max()
>>> min_ = runtime_data.min()
>>> num_bin = (max_-min_)//5
>>> # 绘制图像
>>> plt.hist(runtime_data,num_bin)
>>> # 添加刻度
>>> plt.xticks(range(min_,max_+5,5))
>>> # 添加网格
>>> plt.grid()
>>> plt.show()    #显示到屏幕上（如图8-8所示）
>>> # 查看评分平均数
>>> movie["Rating"].mean()
>>> # 查看导演人数
>>> np.unique(movie["Director"]).shape[0]
>>> len(set(movie["Director"].tolist()))
>>> # 查看演员人数
>>> num = movie["Actors"].str.split(',').tolist()
>>> actor_nums = [j for i in num for j in i]
>>> len(set(actor_nums))
>>> movie["Genre"].head()
0     Action,Adventure,Sci-Fi
1    Adventure,Mystery,Sci-Fi
2             Horror,Thriller
3     Animation,Comedy,Family
4    Action,Adventure,Fantasy
Name: Genre, dtype: object
>>> # 将'Genre'转化为列表
>>> temp_list = [i for i in movie['Genre']]
>>> # 去除分隔符，变成二维数组
>>> temp_list = [i.split(sep=',') for i in movie['Genre']]
>>> # 提取二维数组中元素
>>> [i for j in temp_list for i in j]
>>> # 去重，得到所有电影类别
>>> array_list = np.unique([i for j in temp_list for i in j])
>>> #创建一个全为0的DataFrame，列索引置为电影的分类
>>> array_list.shape
>>> movie.shape
>>> np.zeros((movie.shape[0], array_list.shape[0]))
>>> genre_zero = pd.DataFrame(np.zeros((movie.shape[0], array_list.shape[0])),
              columns=array_list,
              index=movie["Title"])
>>> #遍历每一部电影，DataFrame中把分类出现的列的值置为1
>>> for i  in range(movie.shape[0]):
    genre_zero.iloc[i, genre_zero.columns.get_indexer(temp_list[i])] = 1
>>> genre_zero
>>> # 对每个分类求和
>>> genre_zero.sum(axis=0)
>>> # 排序、画图
>>> new_zeros = genre_zero.sum(axis=0)
>>> new_zeros
>>> genre_count = new_zeros.sort_values(ascending=False)
>>> x_ = genre_count.index
>>> y_ = genre_count.values
>>> plt.figure(figsize=(20,8),dpi=80)
>>> plt.bar(range(len(x_)),y_,width=0.4,color="orange")
>>> plt.xticks(range(len(x_)),x_)
>>> plt.show()    #显示到屏幕上（如图8-10所示）

> pip install numpy
> pip install pandas
> pip install matplotlib
> pip list

>>> import numpy as np
>>> import pandas as pd
>>> import matplotlib.pyplot as plt
>>> from datetime import datetime
>>> import time
>>> tag=pd.read_csv("C:\\Python38\\dataset\\tag.csv")
>>> trd=pd.read_csv("C:\\Python38\\dataset\\tradition.csv")
>>> beh=pd.read_csv("C:\\Python38\\dataset\\behavior.csv")
>>> # tag数据
>>> tag.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39923 entries, 0 to 39922
Data columns (total 43 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   id                               39923 non-null  object
 1   flag                             39923 non-null  int64 
 2   gdr_cd                           39923 non-null  object
 3   age                              39923 non-null  int64 
 4   mrg_situ_cd                      39923 non-null  object
 5   edu_deg_cd                       27487 non-null  object
 6   acdm_deg_cd                      39922 non-null  object
 7   deg_cd                           18960 non-null  object
 8   job_year                         39923 non-null  object
 9   ic_ind                           39923 non-null  object
 ……
35  l1y_crd_card_csm_amt_dlm_cd      39923 non-null  object
 36  atdd_type                        16266 non-null  object
 37  perm_crd_lmt_cd                  39923 non-null  int64 
 ……
dtypes: int64(11), object(32)
memory usage: 13.1+ MB

>>> # trd数据
>>> trd.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1367211 entries, 0 to 1367210
Data columns (total 8 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   id           1367211 non-null  object 
 1   flag         1367211 non-null  int64  
 2   Dat_Flg1_Cd  1367211 non-null  object 
 3   Dat_Flg3_Cd  1367211 non-null  object 
 4   Trx_Cod1_Cd  1367211 non-null  int64  
 5   Trx_Cod2_Cd  1367211 non-null  int64  
 6   trx_tm       1367211 non-null  object 
 7   cny_trx_amt  1367211 non-null  float64
dtypes: float64(1), int64(3), object(4)
memory usage: 83.4+ MB
>>> # beh数据
>>> beh.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 934282 entries, 0 to 934281
Data columns (total 4 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   id       934282 non-null  object
 1   flag     934282 non-null  int64 
 2   page_no  934282 non-null  object
 3   page_tm  934282 non-null  object
dtypes: int64(1), object(3)
memory usage: 28.5+ MB
>>> total=tag.shape[0]   # shape[0]用于获取数据的行数
>>> total
39923
>>> tradition_total=trd.groupby('id').count().shape[0]
>>> tradition_total
31993
>>> behavior_total=beh.groupby('id').count().shape[0]
>>> behavior_total
11913
>>> print(tradition_total/total)   #大概80%的用户有交易记录
0.8013676326929339
>>> print(behavior_total/total)    #仅有大约30%的用户有APP行为数据
0.2983994188813466
>>> x=['total','tradition_total','behavior_total']
>>> y=[total,tradition_total,behavior_total]
>>> plt.figure(figsize=(8,8))
<Figure size 800x800 with 0 Axes>
>>> plt.bar(x,y,width=0.3)
<BarContainer object of 3 artists>
>>> plt.show()    #在屏幕上显示图（如图8-11所示）

>>> # 用柱状图统计各个特征情况
>>> def feature_bar(feature,data,figsize=(8,8)):
    feat_data=data[feature].value_counts()
    plt.figure(figsize=figsize)
    plt.bar(feat_data.index.values,feat_data.values,color='red',alpha=0.5)
    plt.title('value_counts of '+feature)
    plt.ylabel('counts')
    plt.xlabel('value')
    plt.show()

>>> feature_bar('age',tag)
>>> # 对年龄段做分桶
>>> bins=[i*10 for i in range(1,10)]
>>> group_names=['[10,20)','[20,30)','[30,40)','[40,50)','[50,60)','[60,70)','[70,80)','[80,90]']
>>> catagories=pd.cut(tag['age'],bins,labels=group_names)
>>> tag['age']=catagories
>>>  #查看有缺失值的字段的情况
>>> print(tag['edu_deg_cd'].value_counts())
F     6917
C     6695
B     6672
K     2312
Z     2097
G      953
A      889
\N     736
~      108
M       54
L       33
D       20
J        1
Name: edu_deg_cd, dtype: int64
>>> tag['acdm_deg_cd'].value_counts()
G     13267
31    10419
30     8229
Z      4469
F      1635
C      1064
\N      736
D       103
Name: acdm_deg_cd, dtype: int64
>>> print(tag['deg_cd'].value_counts())
~     17050
\N      736
A       543
B       332
Z       171
C       118
D        10
Name: deg_cd, dtype: int64
>>> feature_bar('edu_deg_cd',tag)
>>> feature_bar('acdm_deg_cd',tag)
>>> feature_bar('deg_cd',tag)
>>> feature_bar('atdd_type',tag)
>>> tag['edu_deg_cd'].fillna('~',inplace=True)
>>> tag['acdm_deg_cd'].fillna(r'\N',inplace=True)
>>> tag['deg_cd'].fillna('~',inplace=True)
>>> tag['atdd_type'].fillna(r'\N',inplace=True)

>>> # 将int类型的特征的‘\N’进行处理，原则是不要干扰到原来的比例，将\N当做一个新的类型
>>> # columns1将\N转成0是因为字段本身有特殊的一类-1，需要将\N与-1区分开来，故将其置为0
>>> columns1=['frs_agn_dt_cnt','fin_rsk_ases_grd_cd','confirm_rsk_ases_lvl_typ_cd', 'cust_inv_rsk_endu_lvl_cd','tot_ast_lvl_cd','pot_ast_lvl_cd','hld_crd_card_grd_cd']
>>> for i in columns1:
    tag[i].replace({r'\N':0},inplace=True)
    # 转成int
    tag[i]=tag[i].astype(int)   
>>> # columns2将\N转成-1，思路其实一样，为了将\N与数据区分开来，字段里表示数字的含有0，故将\N转为-1
>>> columns2=['job_year','l12mon_buy_fin_mng_whl_tms',
'l12_mon_fnd_buy_whl_tms','l12_mon_insu_buy_whl_tms', 
l12_mon_gld_buy_whl_tms','ovd_30d_loan_tot_cnt',
'his_lng_ovd_day','l1y_crd_card_csm_amt_dlm_cd']
>>> for i in columns2:
    tag[i].replace({r'\N':-1},inplace=True)
    # 转成int
    tag[i]=tag[i].astype(int)   
>>> # 转成str类型
>>> columns3=['gdr_cd','mrg_situ_cd','edu_deg_cd','acdm_deg_cd','deg_cd',
'ic_ind','fr_or_sh_ind', dnl_mbl_bnk_ind','dnl_bind_cmb_lif_ind','hav_car_grp_ind',
'hav_hou_grp_ind', l6mon_agn_ind','vld_rsk_ases_ind','loan_act_ind',
'crd_card_act_ind','atdd_type','age']
>>> for i in columns3:
    # 转成str
    tag[i]=tag[i].astype(str)
最后，保存一下经过预处理的tag数据：
>>> # 保存补充缺失值后的数据
>>> completed_tag=tag
>>> completed_tag.to_csv("C:\\Python38\\dataset\\completed_tag.csv")

>>> # 将交易时间trx_tm特征进行提取，提取出年月日周等信息
>>> trd['date']=trd['trx_tm'].apply(lambda x: x[0:10])
>>> trd['month']=trd['trx_tm'].apply(lambda x: int(x[5:7]))
>>> trd['day_1']=trd['trx_tm'].apply(lambda x: int(x[8:10]))
>>> trd['hour']=trd['trx_tm'].apply(lambda x: int(x[11:13]))
>>> trd['trx_tm']=trd['trx_tm'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
>>> trd['day']=trd['trx_tm'].apply(lambda x: x.dayofyear)
>>> trd['weekday']=trd['trx_tm'].apply(lambda x: x.weekday())
>>> trd['isWeekend']=trd['weekday'].apply(lambda x: 1 if x in [5, 6] else 0)
>>> trd['trx_tm']=trd['trx_tm'].apply(lambda x: int(time.mktime(x.timetuple())))
>>> # 保存补充完后的数据，用于数据分析
>>> completed_trd=trd
>>> completed_trd.to_csv("C:\\Python38\\dataset\\completed_trd.csv")

厦大数据库实验室博客

代码-第8章使用pandas进行数据清洗-林子雨编著《数据采集与预处理》

第8章使用pandas进行数据清洗

厦大数据库实验室博客

第8章 使用pandas进行数据清洗

第8章使用pandas进行数据清洗