import sqlite3
# Open a connection to the SQLite database file, then pull every row and
# column of the tracks table into a DataFrame and preview it.
con = sqlite3.connect("chinook.db")
tracks = pd.read_sql_query(sql="SELECT * FROM tracks", con=con)
tracks.head()
|
TrackId |
Name |
AlbumId |
MediaTypeId |
GenreId |
Composer |
Milliseconds |
Bytes |
UnitPrice |
0 |
1 |
For Those About To Rock (We Salute You) |
1 |
1 |
1 |
Angus Young, Malcolm Young, Brian Johnson |
343719 |
11170334 |
0.99 |
1 |
2 |
Balls to the Wall |
2 |
2 |
1 |
None |
342562 |
5510424 |
0.99 |
2 |
3 |
Fast As a Shark |
3 |
2 |
1 |
F. Baltes, S. Kaufman, U. Dirkscneider & W. Ho... |
230619 |
3990994 |
0.99 |
3 |
4 |
Restless and Wild |
3 |
2 |
1 |
F. Baltes, R.A. Smith-Diesel, S. Kaufman, U. D... |
252051 |
4331779 |
0.99 |
4 |
5 |
Princess of the Dawn |
3 |
2 |
1 |
Deaffy & R.A. Smith-Diesel |
375418 |
6290521 |
0.99 |
# read_sql_query runs a SQL statement and returns its result set as a
# DataFrame: the first argument is the query text and the second is the
# database connection.
genres = pd.read_sql_query(sql="SELECT * FROM genres", con=con)
genres.head()
|
GenreId |
Name |
0 |
1 |
Rock |
1 |
2 |
Jazz |
2 |
3 |
Metal |
3 |
4 |
Alternative & Punk |
4 |
5 |
Rock And Roll |
# Attach each track's duration to its genre through the shared GenreId key.
# The left join keeps every genre row; the key column is dropped afterwards
# because only the genre name and the duration are needed from here on.
genre_track = pd.merge(
    genres,
    tracks[['GenreId', 'Milliseconds']],
    on='GenreId',
    how='left',
).drop(columns='GenreId')
genre_track.head()
|
Name |
Milliseconds |
0 |
Rock |
343719 |
1 |
Rock |
342562 |
2 |
Rock |
230619 |
3 |
Rock |
252051 |
4 |
Rock |
375418 |
# Find the average track length for each genre.
genre_time = genre_track.groupby('Name')['Milliseconds'].agg('mean')
# Interpret the averages as milliseconds, truncate to whole seconds, and
# list the genres from shortest to longest.
(
    pd.to_timedelta(genre_time, unit='ms')
    .dt.floor('s')
    .sort_values()
)
Name
Rock And Roll 00:02:14
Opera 00:02:54
Hip Hop/Rap 00:02:58
Easy Listening 00:03:09
Bossa Nova 00:03:39
R&B/Soul 00:03:40
World 00:03:44
Pop 00:03:49
Latin 00:03:52
Alternative & Punk 00:03:54
Soundtrack 00:04:04
Reggae 00:04:07
Alternative 00:04:24
Blues 00:04:30
Rock 00:04:43
Jazz 00:04:51
Classical 00:04:53
Heavy Metal 00:04:57
Electronica/Dance 00:05:02
Metal 00:05:09
Comedy 00:26:25
TV Shows 00:35:45
Drama 00:42:55
Science Fiction 00:43:45
Sci Fi & Fantasy 00:48:31
Name: Milliseconds, dtype: timedelta64[ns]
# Goal: find the total amount spent by each customer.
# Load the three tables involved, keeping only the columns needed to link a
# customer to the line items on their invoices.
cust = pd.read_sql_query(sql="SELECT * FROM customers", con=con)[
    ['CustomerId', 'FirstName', 'LastName']
]
invoice = pd.read_sql_query(sql="SELECT * FROM invoices", con=con)[
    ['InvoiceId', 'CustomerId']
]
ii = pd.read_sql_query(sql="SELECT * FROM invoice_items", con=con)[
    ['InvoiceId', 'UnitPrice', 'Quantity']
]
# customers -> invoices on CustomerId, then invoices -> items on InvoiceId.
cust_inv = (
    cust.merge(invoice, on='CustomerId')
    .merge(ii, on='InvoiceId')
)
cust_inv.head()
|
CustomerId |
FirstName |
LastName |
InvoiceId |
UnitPrice |
Quantity |
0 |
1 |
Luís |
Gonçalves |
98 |
1.99 |
1 |
1 |
1 |
Luís |
Gonçalves |
98 |
1.99 |
1 |
2 |
1 |
Luís |
Gonçalves |
121 |
0.99 |
1 |
3 |
1 |
Luís |
Gonçalves |
121 |
0.99 |
1 |
4 |
1 |
Luís |
Gonçalves |
121 |
0.99 |
1 |
# Multiply quantity by unit price to get the cost of each line item, then
# add those costs up per customer to get each customer's total spending.
cols = ['CustomerId', 'FirstName', 'LastName']
total = cust_inv['Quantity'].mul(cust_inv['UnitPrice'])
(
    cust_inv.assign(Total=total)
    .groupby(cols)['Total']
    .sum()
    .sort_values(ascending=False)  # biggest spenders first
    .head()
)
CustomerId FirstName LastName
6 Helena Holý 49.62
26 Richard Cunningham 47.62
57 Luis Rojas 46.62
46 Hugh O'Reilly 45.62
45 Ladislav Kovács 45.62
Name: Total, dtype: float64
# read_sql_query executes an arbitrary SQL statement; here the row limit is
# part of the query, so it is applied by the database.
pd.read_sql_query(sql="select * from tracks limit 5", con=con)
|
TrackId |
Name |
AlbumId |
MediaTypeId |
GenreId |
Composer |
Milliseconds |
Bytes |
UnitPrice |
0 |
1 |
For Those About To Rock (We Salute You) |
1 |
1 |
1 |
Angus Young, Malcolm Young, Brian Johnson |
343719 |
11170334 |
0.99 |
1 |
2 |
Balls to the Wall |
2 |
2 |
1 |
None |
342562 |
5510424 |
0.99 |
2 |
3 |
Fast As a Shark |
3 |
2 |
1 |
F. Baltes, S. Kaufman, U. Dirkscneider & W. Ho... |
230619 |
3990994 |
0.99 |
3 |
4 |
Restless and Wild |
3 |
2 |
1 |
F. Baltes, R.A. Smith-Diesel, S. Kaufman, U. D... |
252051 |
4331779 |
0.99 |
4 |
5 |
Princess of the Dawn |
3 |
2 |
1 |
Deaffy & R.A. Smith-Diesel |
375418 |
6290521 |
0.99 |
# A long multi-line string can be passed to read_sql_query.
# This query joins genres to tracks, averages Milliseconds per genre name,
# converts the average to seconds and formats it with SQLite's time()
# function, then orders the genres by that formatted value.
sql_string1 = '''
select
Name,
time(avg(Milliseconds) / 1000, 'unixepoch') as avg_time
from (
select
g.Name,
t.Milliseconds
from
genres as g
join
tracks as t
on
g.genreid == t.genreid
)
group by
Name
order by
avg_time
'''
pd.read_sql_query(sql=sql_string1, con=con).head(n=5)
|
Name |
avg_time |
0 |
Rock And Roll |
00:02:14 |
1 |
Opera |
00:02:54 |
2 |
Hip Hop/Rap |
00:02:58 |
3 |
Easy Listening |
00:03:09 |
4 |
Bossa Nova |
00:03:39 |
# Total spending per customer computed entirely in SQL: customers are joined
# to their invoices and invoice items, quantity * unitprice is summed per
# customer, and the result is ordered from the highest total down.
sql_string2 = '''
select
c.customerid,
c.FirstName,
c.LastName,
sum(ii.quantity * ii.unitprice) as Total
from
customers as c
join
invoices as i
on c.customerid = i.customerid
join
invoice_items as ii
on i.invoiceid = ii.invoiceid
group by
c.customerid, c.FirstName, c.LastName
order by
Total desc
'''
pd.read_sql_query(sql=sql_string2, con=con).head(n=5)
|
CustomerId |
FirstName |
LastName |
Total |
0 |
6 |
Helena |
Holý |
49.62 |
1 |
26 |
Richard |
Cunningham |
47.62 |
2 |
57 |
Luis |
Rojas |
46.62 |
3 |
45 |
Ladislav |
Kovács |
45.62 |
4 |
46 |
Hugh |
O'Reilly |
45.62 |