• Pandas教程(一)


    参考资料:清华计算机博士带你学-Python金融量化分析


     

    目录

    demo1-Series使用特性

    demo2-Series整数索引问题

    demo3-Series数据对齐

    demo4-Series缺失值的处理

    demo5-DataFrame的创建

    demo6-DataFrame常见属性

    demo7-DataFrame索引与切片

    demo8-DataFrame数据对齐与缺失数据处理

    demo9-Pandas常用函数

    demo10-时间对象

    demo11-时间序列

    demo12-Pandas文件操作


    demo1-Series使用特性

    1. #Pandas-数据分析核心包
    2. import pandas as pd
    3. import numpy as np
    4. #demo1-Series使用特性
    5. #数组与字典结合体
    6. print("*****Series支持数组的特性:索引*****")
    7. sr = pd.Series([2,3,4,5],index=["a","b","c","d"]) #array_like创建Series
    8. print(sr)
    9. print(sr[0]) #按下标索引
    10. print(sr["a"]) #按标签索引
    11. print(sr+2) #与标量运算
    12. print(sr+2*sr) #两个Series运算
    13. print(sr[:2]) #切片
    14. print(sr[[1,3]]) #花式索引
    15. print(np.sqrt(sr)) #通用函数
    16. print(sr[sr>3]) #布尔型索引
    17. print("*****Series支持字典的特性:标签*****")
    18. sr = pd.Series({"a":1,"b":2}) #使用字典创建Series
    19. print(sr)
    20. print(sr["a"],sr["b"])
    21. print(sr[["a","b"]]) #花式索引
    22. print("a" in sr,"c" in sr) #in运算
    23. for each in sr:
    24.     print(each) #与内置dict不同,打印的是value
    25. print(sr.index,sr.index[0])
    26. print(sr.values,sr.values[1])

     

    *****Series支持数组的特性:索引*****
    a    2
    b    3
    c    4
    d    5
    dtype: int64
    2
    2
    a    4
    b    5
    c    6
    d    7
    dtype: int64
    a     6
    b     9
    c    12
    d    15
    dtype: int64
    a    2
    b    3
    dtype: int64
    b    3
    d    5
    dtype: int64
    a    1.414214
    b    1.732051
    c    2.000000
    d    2.236068
    dtype: float64
    c    4
    d    5
    dtype: int64
    *****Series支持字典的特性:标签*****
    a    1
    b    2
    dtype: int64
    1 2
    a    1
    b    2
    dtype: int64
    True False
    1
    2
    Index(['a', 'b'], dtype='object') a
    [1 2] 2

    demo2-Series整数索引问题

    1. #demo2-Series整数索引问题
    2. #iloc与loc
    3. sr = pd.Series(np.arange(20))
    4. srNew = sr[10:].copy()
    5. print(srNew,sr[10]) #默认是标签
    6. print(srNew.iloc[9]) #按下标(位置)
    7. print(srNew.loc[10]) #按标签
    10    10
    11    11
    12    12
    13    13
    14    14
    15    15
    16    16
    17    17
    18    18
    19    19
    dtype: int32 10
    19
    10

    demo3-Series数据对齐

    1. print("*****sr1与sr2等长*****")
    2. sr1 = pd.Series([12,23,34],index=["c","a","d"])
    3. sr2 = pd.Series([11,20,10],index=["d","c","a"])
    4. print(sr1+sr2) #按标签index相加
    5. print("*****sr1与sr2不等长*****")
    6. sr1 = pd.Series([12,23,34],index=["c","a","d"])
    7. sr2 = pd.Series([11,20,10,21],index=["d","c","a","b"])
    8. print(sr1+sr2) #按标签index相加
    9. print("*****sr1与sr2不等长:fill_value*****")
    10. sr1 = pd.Series([12,23,34],index=["c","a","d"])
    11. sr2 = pd.Series([11,20,10,21],index=["d","c","a","b"])
    12. print(sr1.add(sr2,fill_value=0)) #注意观察"b"标签所对应的值

     

    *****sr1与sr2等长*****
    a    33
    c    32
    d    45
    dtype: int64
    *****sr1与sr2不等长*****
    a    33.0
    b     NaN
    c    32.0
    d    45.0
    dtype: float64
    *****sr1与sr2不等长:fill_value*****
    a    33.0
    b    21.0
    c    32.0
    d    45.0
    dtype: float64

    demo4-Series缺失值的处理

    1. #demo4-Series缺失值的处理
    2. sr1 = pd.Series([12,23,34],index=["c","a","d"])
    3. sr2 = pd.Series([11,20,10,21],index=["d","c","a","b"])
    4. sr3 = sr1 + sr2
    5. print("*****删除缺失值的3种方法*****")
    6. print(sr3)
    7. print(sr3.isnull())
    8. print(sr3.notnull())
    9. print(sr3[~sr3.isnull()]) #方法一
    10. print(sr3[sr3.notnull()]) #方法二
    11. print(sr3.dropna()) #方法三
    12. print(sr3) #不在原始数据上改
    13. print("*****填充缺失值*****")
    14. print(sr3.fillna(0)) #填充0
    15. print(sr3.fillna(sr3.mean())) #填充平均值
    *****删除缺失值的3种方法*****
    a    33.0
    b     NaN
    c    32.0
    d    45.0
    dtype: float64
    a    False
    b     True
    c    False
    d    False
    dtype: bool
    a     True
    b    False
    c     True
    d     True
    dtype: bool
    a    33.0
    c    32.0
    d    45.0
    dtype: float64
    a    33.0
    c    32.0
    d    45.0
    dtype: float64
    a    33.0
    c    32.0
    d    45.0
    dtype: float64
    a    33.0
    b     NaN
    c    32.0
    d    45.0
    dtype: float64
    *****填充缺失值*****
    a    33.0
    b     0.0
    c    32.0
    d    45.0
    dtype: float64
    a    33.000000
    b    36.666667
    c    32.000000
    d    45.000000
    dtype: float64

     #Series小结
    #字典与数组的结合体:下标索引 + 标签访问
    #整数索引loc与iloc
    #数据对齐/ sr1.add(sr2,fill_value=0)
    #缺失数据处理:①dropna ②fillna(0) 

    demo5-DataFrame的创建

    1. #demo5-DataFrame的创建
    2. #表格型数据结构 ,可以看作由Series组成的字典,共用一个索引
    3. print(pd.DataFrame({"one":[1,2,3],"two":[4,5,6]},index=["a","b","c"])) #统一指定index
    4. #一列为同一个类型,由于存在nan,因此one列变为浮点型 采用Series分别指定index
    5. print(pd.DataFrame({"one":pd.Series([1,2,3],index=["a","b","c"]),"two":pd.Series([2,1,3,4],index=["a","b","c","d"])}))
    6. #pd.read_csv() #读取文件
    7. #df.to_csv() #保存文件
    8. data = pd.read_csv("600519.csv",index_col="trade_date")
    9. data
       one  two
    a    1    4
    b    2    5
    c    3    6
       one  two
    a  1.0    2
    b  2.0    1
    c  3.0    3
    d  NaN    4

    demo6-DataFrame常见属性

    1. #demo6-DataFrame常见属性
    2. #index 标签
    3. #T 转置
    4. #columns 列名
    5. #values 数值值
    6. #describe 快速统计
    7. df = pd.DataFrame({"one":[1,2,3],"two":[4,5,6]},index=["a","b","c"])
    8. print(df)
    9. print(df.index) #获取标签
    10. print(df.T) #转置
    11. print(df.columns) #列名/列索引
    12. print(df.values,type(df.values),df.values.shape) #获取数组值->返回二维数组
    13. print(df.describe())
       one  two
    a    1    4
    b    2    5
    c    3    6
    Index(['a', 'b', 'c'], dtype='object')
         a  b  c
    one  1  2  3
    two  4  5  6
    Index(['one', 'two'], dtype='object')
    [[1 4]
     [2 5]
     [3 6]] <class 'numpy.ndarray'> (3, 2)
           one  two
    count  3.0  3.0
    mean   2.0  5.0
    std    1.0  1.0
    min    1.0  4.0
    25%    1.5  4.5
    50%    2.0  5.0
    75%    2.5  5.5
    max    3.0  6.0

    demo7-DataFrame索引与切片

    1. #demo7-DataFrame索引与切片
    2. #使用loc或者iloc访问
    3. #[行,列]
    4. df = pd.DataFrame({"one":pd.Series([1,2,3],index=["a","b","c"]),"two":pd.Series([2,1,3,4],index=["a","b","c","d"])})
    5. print(df)
    6. print(df["one"]["a"]) #先列后行:不推荐连续使用两个中括号
    7. print("*****取某一个元素*****")
    8. print(df.loc["a","one"]) #取某一个
    9. print("*****取某一列元素*****")
    10. print(df["one"],type(df["one"])) #看一列数据->Series
    11. print("*****取某一行元素*****")
    12. print(df.iloc[0]) #看一行数据
    13. print(df.loc["a",:]) #看一行数据,:切片表示所有
    14. print(df.loc["a",]) #看一行数据 ,后省略也是默认所有
    15. print(df.loc["a"]) #看一行数据->简写
    16. print("*****根据需求任意取元素*****")
    17. print(df.loc[["a","d"],:]) #常规索引 切片 布尔值索引 花式索引可以任意搭配
       one  two
    a  1.0    2
    b  2.0    1
    c  3.0    3
    d  NaN    4
    1.0
    *****取某一个元素*****
    1.0
    *****取某一列元素*****
    a    1.0
    b    2.0
    c    3.0
    d    NaN
    Name: one, dtype: float64 <class 'pandas.core.series.Series'>
    *****取某一行元素*****
    one    1.0
    two    2.0
    Name: a, dtype: float64
    one    1.0
    two    2.0
    Name: a, dtype: float64
    one    1.0
    two    2.0
    Name: a, dtype: float64
    one    1.0
    two    2.0
    Name: a, dtype: float64
    *****根据需求任意取元素*****
       one  two
    a  1.0    2
    d  NaN    4

    demo8-DataFrame数据对齐与缺失数据处理

    1. #demo8-DataFrame数据对齐与缺失数据处理
    2. df1 = pd.DataFrame({"two":[1,2,3,4],"one":[4,5,6,7]},index=list("cdba"))
    3. df2 = pd.DataFrame({"one":pd.Series([1,2,3],index=["a","b","c"]),"two":pd.Series([2,1,3,4],index=["a","b","c","d"])})
    4. print("*****DataFrame数据对齐*****")
    5. print(df1)
    6. print(df2)
    7. print(df1+df2) #行索引与列索引对齐相加
    8. #dropna(axis=0,how='any')
    9. #fillna(value)
    10. #isnull
    11. #notnull
    12. df2.loc["d","two"] = np.nan
    13. df2.loc["c","two"] = np.nan
    14. print(df2)
    15. print("*****fillna()*****")
    16. print(df2.fillna(999)) #填充值
    17. print("*****dropna:默认参数axis=0为行,how='any'即任意有nan即删除*****")
    18. print(df2.dropna()) #有一个缺失值,就删除一整行 默认how是any即任何一个,默认axis=0是行
    19. print(df2.dropna(how="any",axis=0))
    20. print(df2.dropna(how="all")) #所有都是nan才删除行
    21. print(df2.dropna(axis=1))#axis=1是列

     

    *****DataFrame数据对齐*****
       two  one
    c    1    4
    d    2    5
    b    3    6
    a    4    7
       one  two
    a  1.0    2
    b  2.0    1
    c  3.0    3
    d  NaN    4
       one  two
    a  8.0    6
    b  8.0    4
    c  7.0    4
    d  NaN    6
       one  two
    a  1.0  2.0
    b  2.0  1.0
    c  3.0  NaN
    d  NaN  NaN
    *****fillna()*****
         one    two
    a    1.0    2.0
    b    2.0    1.0
    c    3.0  999.0
    d  999.0  999.0
    *****dropna:默认参数axis=0为行,how='any'即任意有nan即删除*****
       one  two
    a  1.0  2.0
    b  2.0  1.0
       one  two
    a  1.0  2.0
    b  2.0  1.0
       one  two
    a  1.0  2.0
    b  2.0  1.0
    c  3.0  NaN
    Empty DataFrame
    Columns: []
    Index: [a, b, c, d]

    demo9-Pandas常用函数

    1. #demo9-Pandas常用函数
    2. #mean(axis=0,skipna=True)
    3. #sum(axis=1)
    4. #sort_index(axis,...,ascending)
    5. #sort_values(by,axis,ascending)
    6. #Numpy的函数同样适用于Pandas
    7. #axis=0表示跨行,axis=1表示跨列/通用函数与dropna联合理解
    8. #推荐资料:https://zhuanlan.zhihu.com/p/110105054
    9. print(df)
    10. print("*****mean*****")
    11. print(df.mean()) #按列求平均 (1+2+3)/3=2
    12. print(df.mean(axis=0)) #axis-跨行/默认按列
    13. print(df.mean(skipna=True))
    14. print(df.mean(axis=1)) #按行求平均
    15. print("*****sum*****")
    16. print(df.sum()) #默认按列
    17. print(df.sum(axis=1))
    18. print("*****sort_values*****") #nan不参与排序,无论升序或者降序均放在最后面
    19. print(df)
    20. print(df.sort_values(by="two")) #by:按某列排序
    21. print(df.sort_values(by="two",ascending=False)) #ascending意为上升,为False则为降序
    22. print(df.sort_values(axis=1,by="b"))
    23. print("*****sort_index*****")
    24. dfNew = pd.DataFrame({"two":[1,2,3,4],"one":[4,5,6,7]},index=list("cdba"))
    25. print(dfNew)
    26. print(dfNew.sort_index())
    27. print(dfNew.sort_index(ascending=False))
    28. print(dfNew.sort_index(axis=1))
       one  two
    a  1.0    2
    b  2.0    1
    c  3.0    3
    d  NaN    4
    *****mean*****
    one    2.0
    two    2.5
    dtype: float64
    one    2.0
    two    2.5
    dtype: float64
    one    2.0
    two    2.5
    dtype: float64
    a    1.5
    b    1.5
    c    3.0
    d    4.0
    dtype: float64
    *****sum*****
    one     6.0
    two    10.0
    dtype: float64
    a    3.0
    b    3.0
    c    6.0
    d    4.0
    dtype: float64
    *****sort_values*****
       one  two
    a  1.0    2
    b  2.0    1
    c  3.0    3
    d  NaN    4
       one  two
    b  2.0    1
    a  1.0    2
    c  3.0    3
    d  NaN    4
       one  two
    d  NaN    4
    c  3.0    3
    a  1.0    2
    b  2.0    1
       two  one
    a    2  1.0
    b    1  2.0
    c    3  3.0
    d    4  NaN
    *****sort_index*****
       two  one
    c    1    4
    d    2    5
    b    3    6
    a    4    7
       two  one
    a    4    7
    b    3    6
    c    1    4
    d    2    5
       two  one
    d    2    5
    c    1    4
    b    3    6
    a    4    7
       one  two
    c    4    1
    d    5    2
    b    6    3
    a    7    4

    demo10-时间对象

    1. #demo10-时间对象
    2. print("*****pd.to_datetime()*****")
    3. print(pd.to_datetime(["2021-7-20","1996.10.04","1997-MAY-20"]))
    4. print(type(pd.to_datetime(["2021-7-20","1996.10.04","1997-MAY-20"]))) #类型为DatetimeIndex通常用于做索引
    5. print(type(pd.to_datetime(["2021-7-20","1996.10.04","1997-MAY-20"])[0]))
    6. print("*****生成一定时间范围的时间对象*****")
    7. print(pd.date_range("2010.1.1","2010.5.1")) #指定start与end
    8. print(pd.date_range("2010.1.1",periods=30)) #periods为长度 freq默认是D(天),W为周
    9. print(pd.date_range("2010.1.1",periods=30,freq="h")) #freq为小时
    10. print(pd.date_range("2010.1.1",periods=30,freq="1h30min")) #freq为1小时30分钟
    11. print(pd.date_range("2022.9.1",periods=30,freq="B")) #B为工作日
    12. print(type(pd.date_range("2022.9.1",periods=30,freq="B")))
    13. print(type(pd.date_range("2022.9.1",periods=30,freq="B")[0])) #类型为时间戳
    *****pd.to_datetime()*****
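    DatetimeIndex(['2021-07-20', '1996-10-04', '1997-05-20'], dtype='datetime64[ns]', freq=None)
    <class 'pandas.core.indexes.datetimes.DatetimeIndex'>
    <class 'pandas._libs.tslibs.timestamps.Timestamp'>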
    *****生成一定时间范围的时间对象*****
    DatetimeIndex(['2010-01-01', '2010-01-02', '2010-01-03', '2010-01-04',
                   '2010-01-05', '2010-01-06', '2010-01-07', '2010-01-08',
                   '2010-01-09', '2010-01-10',
                   ...
                   '2010-04-22', '2010-04-23', '2010-04-24', '2010-04-25',
                   '2010-04-26', '2010-04-27', '2010-04-28', '2010-04-29',
                   '2010-04-30', '2010-05-01'],
                  dtype='datetime64[ns]', length=121, freq='D')
    DatetimeIndex(['2010-01-01', '2010-01-02', '2010-01-03', '2010-01-04',
                   '2010-01-05', '2010-01-06', '2010-01-07', '2010-01-08',
                   '2010-01-09', '2010-01-10', '2010-01-11', '2010-01-12',
                   '2010-01-13', '2010-01-14', '2010-01-15', '2010-01-16',
                   '2010-01-17', '2010-01-18', '2010-01-19', '2010-01-20',
                   '2010-01-21', '2010-01-22', '2010-01-23', '2010-01-24',
                   '2010-01-25', '2010-01-26', '2010-01-27', '2010-01-28',
                   '2010-01-29', '2010-01-30'],
                  dtype='datetime64[ns]', freq='D')
    DatetimeIndex(['2010-01-01 00:00:00', '2010-01-01 01:00:00',
                   '2010-01-01 02:00:00', '2010-01-01 03:00:00',
                   '2010-01-01 04:00:00', '2010-01-01 05:00:00',
                   '2010-01-01 06:00:00', '2010-01-01 07:00:00',
                   '2010-01-01 08:00:00', '2010-01-01 09:00:00',
                   '2010-01-01 10:00:00', '2010-01-01 11:00:00',
                   '2010-01-01 12:00:00', '2010-01-01 13:00:00',
                   '2010-01-01 14:00:00', '2010-01-01 15:00:00',
                   '2010-01-01 16:00:00', '2010-01-01 17:00:00',
                   '2010-01-01 18:00:00', '2010-01-01 19:00:00',
                   '2010-01-01 20:00:00', '2010-01-01 21:00:00',
                   '2010-01-01 22:00:00', '2010-01-01 23:00:00',
                   '2010-01-02 00:00:00', '2010-01-02 01:00:00',
                   '2010-01-02 02:00:00', '2010-01-02 03:00:00',
                   '2010-01-02 04:00:00', '2010-01-02 05:00:00'],
                  dtype='datetime64[ns]', freq='H')
    DatetimeIndex(['2010-01-01 00:00:00', '2010-01-01 01:30:00',
                   '2010-01-01 03:00:00', '2010-01-01 04:30:00',
                   '2010-01-01 06:00:00', '2010-01-01 07:30:00',
                   '2010-01-01 09:00:00', '2010-01-01 10:30:00',
                   '2010-01-01 12:00:00', '2010-01-01 13:30:00',
                   '2010-01-01 15:00:00', '2010-01-01 16:30:00',
                   '2010-01-01 18:00:00', '2010-01-01 19:30:00',
                   '2010-01-01 21:00:00', '2010-01-01 22:30:00',
                   '2010-01-02 00:00:00', '2010-01-02 01:30:00',
                   '2010-01-02 03:00:00', '2010-01-02 04:30:00',
                   '2010-01-02 06:00:00', '2010-01-02 07:30:00',
                   '2010-01-02 09:00:00', '2010-01-02 10:30:00',
                   '2010-01-02 12:00:00', '2010-01-02 13:30:00',
                   '2010-01-02 15:00:00', '2010-01-02 16:30:00',
                   '2010-01-02 18:00:00', '2010-01-02 19:30:00'],
                  dtype='datetime64[ns]', freq='90T')
    DatetimeIndex(['2022-09-01', '2022-09-02', '2022-09-05', '2022-09-06',
                   '2022-09-07', '2022-09-08', '2022-09-09', '2022-09-12',
                   '2022-09-13', '2022-09-14', '2022-09-15', '2022-09-16',
                   '2022-09-19', '2022-09-20', '2022-09-21', '2022-09-22',
                   '2022-09-23', '2022-09-26', '2022-09-27', '2022-09-28',
                   '2022-09-29', '2022-09-30', '2022-10-03', '2022-10-04',
                   '2022-10-05', '2022-10-06', '2022-10-07', '2022-10-10',
                   '2022-10-11', '2022-10-12'],
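                  dtype='datetime64[ns]', freq='B')
    <class 'pandas.core.indexes.datetimes.DatetimeIndex'>
    <class 'pandas._libs.tslibs.timestamps.Timestamp'>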

    demo11-时间序列

    1. #demo11-时间序列
    2. print("*****时间对象索引*****")
    3. dateIndex = pd.date_range("2022.9.1",periods=66)
    4. sr = pd.Series(np.arange(66),index=dateIndex)
    5. print(sr.index) #索引为时间对象
    6. print(sr["2022-9-25":]) #从2022.9.25开始切片
    7. print(sr["2022-10":]) #从2022-10开始
    8. print(sr["2022-10"]) #只选择2022-10月
    9. print("*****resample重采样*****")
    10. print(sr.resample("w").sum())
    11. print(sr.resample("w-mon").sum())
    12. print(sr.resample("m").mean())
    *****时间对象索引*****
    DatetimeIndex(['2022-09-01', '2022-09-02', '2022-09-03', '2022-09-04',
                   '2022-09-05', '2022-09-06', '2022-09-07', '2022-09-08',
                   '2022-09-09', '2022-09-10', '2022-09-11', '2022-09-12',
                   '2022-09-13', '2022-09-14', '2022-09-15', '2022-09-16',
                   '2022-09-17', '2022-09-18', '2022-09-19', '2022-09-20',
                   '2022-09-21', '2022-09-22', '2022-09-23', '2022-09-24',
                   '2022-09-25', '2022-09-26', '2022-09-27', '2022-09-28',
                   '2022-09-29', '2022-09-30', '2022-10-01', '2022-10-02',
                   '2022-10-03', '2022-10-04', '2022-10-05', '2022-10-06',
                   '2022-10-07', '2022-10-08', '2022-10-09', '2022-10-10',
                   '2022-10-11', '2022-10-12', '2022-10-13', '2022-10-14',
                   '2022-10-15', '2022-10-16', '2022-10-17', '2022-10-18',
                   '2022-10-19', '2022-10-20', '2022-10-21', '2022-10-22',
                   '2022-10-23', '2022-10-24', '2022-10-25', '2022-10-26',
                   '2022-10-27', '2022-10-28', '2022-10-29', '2022-10-30',
                   '2022-10-31', '2022-11-01', '2022-11-02', '2022-11-03',
                   '2022-11-04', '2022-11-05'],
                  dtype='datetime64[ns]', freq='D')
    2022-09-25    24
    2022-09-26    25
    2022-09-27    26
    2022-09-28    27
    2022-09-29    28
    2022-09-30    29
    2022-10-01    30
    2022-10-02    31
    2022-10-03    32
    2022-10-04    33
    2022-10-05    34
    2022-10-06    35
    2022-10-07    36
    2022-10-08    37
    2022-10-09    38
    2022-10-10    39
    2022-10-11    40
    2022-10-12    41
    2022-10-13    42
    2022-10-14    43
    2022-10-15    44
    2022-10-16    45
    2022-10-17    46
    2022-10-18    47
    2022-10-19    48
    2022-10-20    49
    2022-10-21    50
    2022-10-22    51
    2022-10-23    52
    2022-10-24    53
    2022-10-25    54
    2022-10-26    55
    2022-10-27    56
    2022-10-28    57
    2022-10-29    58
    2022-10-30    59
    2022-10-31    60
    2022-11-01    61
    2022-11-02    62
    2022-11-03    63
    2022-11-04    64
    2022-11-05    65
    Freq: D, dtype: int32
    2022-10-01    30
    2022-10-02    31
    2022-10-03    32
    2022-10-04    33
    2022-10-05    34
    2022-10-06    35
    2022-10-07    36
    2022-10-08    37
    2022-10-09    38
    2022-10-10    39
    2022-10-11    40
    2022-10-12    41
    2022-10-13    42
    2022-10-14    43
    2022-10-15    44
    2022-10-16    45
    2022-10-17    46
    2022-10-18    47
    2022-10-19    48
    2022-10-20    49
    2022-10-21    50
    2022-10-22    51
    2022-10-23    52
    2022-10-24    53
    2022-10-25    54
    2022-10-26    55
    2022-10-27    56
    2022-10-28    57
    2022-10-29    58
    2022-10-30    59
    2022-10-31    60
    2022-11-01    61
    2022-11-02    62
    2022-11-03    63
    2022-11-04    64
    2022-11-05    65
    Freq: D, dtype: int32
    2022-10-01    30
    2022-10-02    31
    2022-10-03    32
    2022-10-04    33
    2022-10-05    34
    2022-10-06    35
    2022-10-07    36
    2022-10-08    37
    2022-10-09    38
    2022-10-10    39
    2022-10-11    40
    2022-10-12    41
    2022-10-13    42
    2022-10-14    43
    2022-10-15    44
    2022-10-16    45
    2022-10-17    46
    2022-10-18    47
    2022-10-19    48
    2022-10-20    49
    2022-10-21    50
    2022-10-22    51
    2022-10-23    52
    2022-10-24    53
    2022-10-25    54
    2022-10-26    55
    2022-10-27    56
    2022-10-28    57
    2022-10-29    58
    2022-10-30    59
    2022-10-31    60
    Freq: D, dtype: int32
    *****resample重采样*****
    2022-09-04      6
    2022-09-11     49
    2022-09-18     98
    2022-09-25    147
    2022-10-02    196
    2022-10-09    245
    2022-10-16    294
    2022-10-23    343
    2022-10-30    392
    2022-11-06    375
    Freq: W-SUN, dtype: int32
    2022-09-05     10
    2022-09-12     56
    2022-09-19    105
    2022-09-26    154
    2022-10-03    203
    2022-10-10    252
    2022-10-17    301
    2022-10-24    350
    2022-10-31    399
    2022-11-07    315
    Freq: W-MON, dtype: int32
    2022-09-30    14.5
    2022-10-31    45.0
    2022-11-30    63.0
    Freq: M, dtype: float64

    demo12-Pandas文件操作

    1. #demo12-Pandas文件操作
    2. #csv:分隔符为逗号
    3. #tsv:分隔符为制表符(xlsx为Excel格式,需使用read_excel读取)
    4. #read_csv参数:
    5. #index_col 选定某列作为index->可以使用数字或者列名 如index_col=0 或者 index_col="date"
    6. #parse_dates True则将所有可以解释为时间对象进行解释 / 也可以为列表,则将特定列解析为时间对象
    7. #header 指定文件无列名-read_csv默认会把第一行解释为列名,因此若文件无列名,则需要使用header指明无列名字
    8. #若header为None可以使用names参数指定列名
    9. #skiprows 跳过某些行/不常用
    10. #na_values = [] 指定哪些字符串为Nan #因为数据的缺省值表示可能不一样,采用该参数统一为nan方便处理
    11. #data.columns = list("abcdefg") 修改列名
    12. #to_csv函数:
    13. #sep:指定分隔符,默认是,
    14. #na_rep:指定缺失值转换的字符串,默认是空字符串
    15. #header:不输出列名
    16. #index:不输出行索引
    17. #columns:指定输出的列,传入列表
    18. #to_json、to_excel、to_pickle.....等
    19. data = pd.read_csv("601318.csv",index_col="date")
    20. print(type(data.index))#此时index并非时间对象
    21. data = pd.read_csv("601318.csv",index_col="date",parse_dates=True)
    22. print(type(data.index))
    23. data = pd.read_csv("601318.csv",index_col="date",parse_dates=["date"])
    24. print(type(data.index))
    25. data = pd.read_csv("601318.csv",index_col="date")
    26. data.columns = list("abcdefg") #
    27. print(data)
    28. data = pd.read_csv("601318.csv",index_col="date",parse_dates=["date"])
    29. print(data)
    30. print(data.loc["2017-5","close"]) #important
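    31. data.to_csv("test.csv",columns=["close","open"],header=False,index=False)
    <class 'pandas.core.indexes.base.Index'>
    <class 'pandas.core.indexes.datetimes.DatetimeIndex'>
    <class 'pandas.core.indexes.datetimes.DatetimeIndex'>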
                   a       b       c       d       e           f       g
    date                                                                
    2007/3/1       0  21.878  20.473  22.302  20.040  1977633.51  601318
    2007/3/2       1  20.565  20.307  20.758  20.075   425048.32  601318
    2007/3/5       2  20.119  19.419  20.202  19.047   419196.74  601318
    2007/3/6       3  19.253  19.800  20.128  19.143   297727.88  601318
    2007/3/7       4  19.817  20.338  20.522  19.651   287463.78  601318
    ...          ...     ...     ...     ...     ...         ...     ...
    2017/12/11  2558  71.200  73.250  73.310  70.820  1139927.00  601318
    2017/12/12  2559  73.250  71.210  73.560  71.170   777900.00  601318
    2017/12/13  2560  71.210  72.120  72.620  70.200   865117.00  601318
    2017/12/14  2561  72.120  71.010  72.160  70.600   676186.00  601318
    2017/12/15  2562  70.690  70.380  71.440  70.050   735547.00  601318
    
    [2563 rows x 7 columns]
                Unnamed: 0    open   close    high     low      volume    code
    date                                                                      
    2007-03-01           0  21.878  20.473  22.302  20.040  1977633.51  601318
    2007-03-02           1  20.565  20.307  20.758  20.075   425048.32  601318
    2007-03-05           2  20.119  19.419  20.202  19.047   419196.74  601318
    2007-03-06           3  19.253  19.800  20.128  19.143   297727.88  601318
    2007-03-07           4  19.817  20.338  20.522  19.651   287463.78  601318
    ...                ...     ...     ...     ...     ...         ...     ...
    2017-12-11        2558  71.200  73.250  73.310  70.820  1139927.00  601318
    2017-12-12        2559  73.250  71.210  73.560  71.170   777900.00  601318
    2017-12-13        2560  71.210  72.120  72.620  70.200   865117.00  601318
    2017-12-14        2561  72.120  71.010  72.160  70.600   676186.00  601318
    2017-12-15        2562  70.690  70.380  71.440  70.050   735547.00  601318
    
    [2563 rows x 7 columns]
    date
    2017-05-02    37.167
    2017-05-03    37.255
    2017-05-04    37.079
    2017-05-05    36.530
    2017-05-08    37.049
    2017-05-09    37.245
    2017-05-10    39.059
    2017-05-11    38.990
    2017-05-12    40.147
    2017-05-15    40.098
    2017-05-16    40.285
    2017-05-17    39.628
    2017-05-18    39.824
    2017-05-19    40.206
    2017-05-22    42.000
    2017-05-23    42.324
    2017-05-24    42.186
    2017-05-25    44.598
    2017-05-26    44.294
    2017-05-31    44.187
    Name: close, dtype: float64
  • 相关阅读:
    数据结构 - AVL树
    F5服务器负载均衡能力如何?一文了解
    14.Java实现UDP通信
    电脑开机太慢?这5个方法瞬间提升你的电脑速度
    harbor 安装
    10.20作业
    Cpolar+Emlog搭建指南—在Ubuntu上轻松创建个人博客,无需专业技能
    Centos8安装docker并配置Kali Linux图形化界面
    详解Pinia和Vuex
    基于Php门禁系统设计与实现
  • 原文地址:https://blog.csdn.net/zsllsz2022/article/details/126780761