import pandas as pd
# Load the Texas cities dataset from data/texas_cities.csv.
cities = pd.read_csv(filepath_or_buffer='data/texas_cities.csv')
# Bare expression: shows the frame when the cell is run interactively.
cities
'''
      City             Geolocation
0  Houston  29.7604° N, 95.3698° W
1   Dallas  32.7767° N, 96.7970° W
2   Austin  30.2672° N, 97.7431° W
'''
# Break the Geolocation strings into four separate columns.
# The pattern '. ' (any one character followed by a space) consumes both the
# '° ' and the ', ' separators; expand=True returns the pieces as DataFrame
# columns instead of a Series of lists.
geolocations = cities['Geolocation'].str.split(pat='. ', expand=True)
geolocations.columns = [
    'latitude',
    'latitude direction',
    'longitude',
    'longitude direction',
]
geolocations
'''
   latitude  latitude direction  longitude  longitude direction
0   29.7604                   N    95.3698                    W
1   32.7767                   N    96.7970                    W
2   30.2672                   N    97.7431                    W
'''
# Cast the two coordinate columns to float; the direction columns are not
# listed in the mapping and therefore keep their dtype.
geolocations = geolocations.astype(
    dict.fromkeys(('latitude', 'longitude'), 'float')
)
geolocations.dtypes
'''
latitude               float64
latitude direction      object
longitude              float64
longitude direction     object
dtype: object
'''
# Place the original City column side by side with the parsed coordinate
# columns (column-wise concatenation, aligned on the index).
cities_tidy = pd.concat(objs=[cities.City, geolocations], axis=1)
cities_tidy
'''
      City  latitude  latitude direction  longitude  longitude direction
0  Houston   29.7604                   N    95.3698                    W
1   Dallas   32.7767                   N    96.7970                    W
2   Austin   30.2672                   N    97.7431                    W
'''
# 原理 (How it works)
# pd.to_numeric converts a column to an integer or float dtype when every
# value in it parses as a number.
def _to_numeric_or_unchanged(column):
    """Return *column* converted by ``pd.to_numeric``, or unchanged on failure.

    This replaces ``pd.to_numeric(..., errors='ignore')``, which is deprecated
    in pandas 2.2 and emits a FutureWarning. Catching the conversion error
    explicitly gives the same result: a column that cannot be parsed as
    numbers (such as the direction letters) is handed back as-is.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# Apply the conversion column by column.
temp = geolocations.apply(_to_numeric_or_unchanged)
temp
'''
   latitude  latitude direction  longitude  longitude direction
0   29.7604                   N    95.3698                    W
1   32.7767                   N    96.7970                    W
2   30.2672                   N    97.7431                    W
'''
# Inspect the column dtypes again, this time on the to_numeric result.
temp.dtypes
'''
latitude               float64
latitude direction      object
longitude              float64
longitude direction     object
dtype: object
'''
# The regex alternation operator | lets one split handle several delimiters:
# here the string is cut at either '° ' or ', '.
cities['Geolocation'].str.split(
    pat='° |, ',
    expand=True,
)
'''
         0  1        2  3
0  29.7604  N  95.3698  W
1  32.7767  N  96.7970  W
2  30.2672  N  97.7431  W
'''
# A more elaborate approach: describe the whole string with one regex and
# pull out the four capture groups (number, N|S, number, E|W) as columns.
cities['Geolocation'].str.extract(
    pat='([0-9.]+). (N|S), ([0-9.]+). (E|W)',
    expand=True,
)
'''
         0  1        2  3
0  29.7604  N  95.3698  W
1  32.7767  N  96.7970  W
2  30.2672  N  97.7431  W
'''