Pandas读取.db数据库信息,并处理数据

import sqlite3
con = sqlite3.connect("chinook.db")
tracks = pd.read_sql_query("SELECT * FROM tracks", con)
tracks.head()
  TrackId Name AlbumId MediaTypeId GenreId Composer Milliseconds Bytes UnitPrice
0 1 For Those About To Rock (We Salute You) 1 1 1 Angus Young, Malcolm Young, Brian Johnson 343719 11170334 0.99
1 2 Balls to the Wall 2 2 1 None 342562 5510424 0.99
2 3 Fast As a Shark 3 2 1 F. Baltes, S. Kaufman, U. Dirkscneider & W. Ho... 230619 3990994 0.99
3 4 Restless and Wild 3 2 1 F. Baltes, R.A. Smith-Diesel, S. Kaufman, U. D... 252051 4331779 0.99
4 5 Princess of the Dawn 3 2 1 Deaffy & R.A. Smith-Diesel 375418 6290521 0.99
# read_sql_query函数可以读取一张表,第一个参数是表名,第二个参数是引擎
genres = pd.read_sql_query("SELECT * FROM genres", con)
genres.head()
  GenreId Name
0 1 Rock
1 2 Jazz
2 3 Metal
3 4 Alternative & Punk
4 5 Rock And Roll
genre_track = genres.merge(tracks[['GenreId', 'Milliseconds']],
                           on='GenreId', how='left').drop('GenreId', axis='columns')
genre_track.head()
  GenreId Name
0 1 Rock
1 2 Jazz
2 3 Metal
3 4 Alternative & Punk
4 5 Rock And Roll
# 找到每种类型歌曲的平均时长
genre_time = genre_track.groupby('Name')['Milliseconds'].mean()
pd.to_timedelta(genre_time, unit='ms').dt.floor('s').sort_values()

Name
Rock And Roll        00:02:14
Opera                00:02:54
Hip Hop/Rap          00:02:58
Easy Listening       00:03:09
Bossa Nova           00:03:39
R&B/Soul             00:03:40
World                00:03:44
Pop                  00:03:49
Latin                00:03:52
Alternative & Punk   00:03:54
Soundtrack           00:04:04
Reggae               00:04:07
Alternative          00:04:24
Blues                00:04:30
Rock                 00:04:43
Jazz                 00:04:51
Classical            00:04:53
Heavy Metal          00:04:57
Electronica/Dance    00:05:02
Metal                00:05:09
Comedy               00:26:25
TV Shows             00:35:45
Drama                00:42:55
Science Fiction      00:43:45
Sci Fi & Fantasy     00:48:31
Name: Milliseconds, dtype: timedelta64[ns]
#找到每名顾客花费的总时长
cust = pd.read_sql_query("SELECT * FROM customers", con)                    
cust=cust[['CustomerId', 'FirstName', 'LastName']]

invoice = pd.read_sql_query("SELECT * FROM invoices", con)                    
invoice=invoice[['InvoiceId','CustomerId']]

ii = pd.read_sql_query("SELECT * FROM invoice_items", con)                    
ii=ii[['InvoiceId', 'UnitPrice', 'Quantity']]
cust_inv = cust.merge(invoice, on='CustomerId') \
                         .merge(ii, on='InvoiceId')
cust_inv.head()
  CustomerId FirstName LastName InvoiceId UnitPrice Quantity
0 1 Luís Gonçalves 98 1.99 1
1 1 Luís Gonçalves 98 1.99 1
2 1 Luís Gonçalves 121 0.99 1
3 1 Luís Gonçalves 121 0.99 1
4 1 Luís Gonçalves 121 0.99 1
# 现在可以用总量乘以单位价格,找到每名顾客的总消费
total = cust_inv['Quantity'] * cust_inv['UnitPrice']
cols = ['CustomerId', 'FirstName', 'LastName']
cust_inv.assign(Total = total).groupby(cols)['Total'].sum().sort_values(ascending=False).head()

CustomerId  FirstName  LastName  
6           Helena     Holý          49.62
26          Richard    Cunningham    47.62
57          Luis       Rojas         46.62
46          Hugh       O'Reilly      45.62
45          Ladislav   Kovács        45.62
Name: Total, dtype: float64
# sql语句查询方法read_sql_query
pd.read_sql_query('select * from tracks limit 5', con)
  TrackId Name AlbumId MediaTypeId GenreId Composer Milliseconds Bytes UnitPrice
0 1 For Those About To Rock (We Salute You) 1 1 1 Angus Young, Malcolm Young, Brian Johnson 343719 11170334 0.99
1 2 Balls to the Wall 2 2 1 None 342562 5510424 0.99
2 3 Fast As a Shark 3 2 1 F. Baltes, S. Kaufman, U. Dirkscneider & W. Ho... 230619 3990994 0.99
3 4 Restless and Wild 3 2 1 F. Baltes, R.A. Smith-Diesel, S. Kaufman, U. D... 252051 4331779 0.99
4 5 Princess of the Dawn 3 2 1 Deaffy & R.A. Smith-Diesel 375418 6290521 0.99
# 可以将长字符串传给read_sql_query
sql_string1 = '''
          select 
              Name, 
              time(avg(Milliseconds) / 1000, 'unixepoch') as avg_time
          from (
                  select 
                      g.Name, 
                      t.Milliseconds
                  from 
                      genres as g 
                  join
                      tracks as t
                      on 
                          g.genreid == t.genreid
              )
          group by 
              Name
          order by 
              avg_time
          '''
pd.read_sql_query(sql_string1, con).head()
 
  Name avg_time
0 Rock And Roll 00:02:14
1 Opera 00:02:54
2 Hip Hop/Rap 00:02:58
3 Easy Listening 00:03:09
4 Bossa Nova 00:03:39
sql_string2 = '''
select 
                c.customerid, 
                c.FirstName, 
                c.LastName, 
                sum(ii.quantity *  ii.unitprice) as Total
          from
                customers as c
          join
                invoices as i
                     on c.customerid = i.customerid
          join
                invoice_items as ii
                     on i.invoiceid = ii.invoiceid
          group by
                c.customerid, c.FirstName, c.LastName
          order by
                Total desc
          '''
pd.read_sql_query(sql_string2, con).head()
  CustomerId FirstName LastName Total
0 6 Helena Holý 49.62
1 26 Richard Cunningham 47.62
2 57 Luis Rojas 46.62
3 45 Ladislav Kovács 45.62
4 46 Hugh O'Reilly 45.62

猜你喜欢

转载自blog.csdn.net/weixin_48135624/article/details/114046523
今日推荐