Scott's Blog

学则不固, 知则不惑

0%

Pandas 数据导入

一些常用数据导入的代码示例与技巧。

从网络

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# Import package
from urllib.request import urlretrieve

# Import pandas
import pandas as pd

# Assign url of file: url
url = 'https://you_url/you_file.csv'

# Save file locally
urlretrieve(url,'you_file.csv')

# Read file into a DataFrame and print its head
df = pd.read_csv('you_file.csv', sep=';')
print(df.head())

直接用PD读取

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
# Import packages
import matplotlib.pyplot as plt
import pandas as pd

# Assign url of file: url
url = 'https://s3.amazonaws.com/assets.datacamp.com/production/course_1606/datasets/winequality-red.csv'


# Read file into a DataFrame: df
df = pd.read_csv(url,sep=';')

# Print the head of the DataFrame
print(df.head())

# Plot first column of df
pd.DataFrame.hist(df.ix[:, 0:1])
plt.xlabel('fixed acidity (g(tartaric acid)/dm$^3$)')
plt.ylabel('count')
plt.show()

网络读取excel文件

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
# Import package
import pandas as pd

# Assign url of file: url
url = 'http://s3.amazonaws.com/assets.datacamp.com/course/importing_data_into_r/latitude.xls'


# Read in all sheets of Excel file: xl
xl = pd.read_excel(url,sheetname = None)

# Print the sheetnames to the shell
print(xl.keys())

# Print the head of the first sheet (using its name, NOT its index)
print(xl['1700'].head())

从 Json文件

1
2
3
4
5
6
7
# Load JSON: json_data
with open("a_movie.json") as json_file:
json_data = json.load(json_file)

# Print each key-value pair in json_data
for k in json_data.keys():
print(k + ': ', json_data[k])

从 api 请求 json

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
# Import package
import requests

# Assign URL to variable: url
url = 'http://www.omdbapi.com/?apikey=72bc447a&t=social+network'

# Package the request, send the request and catch the response: r
r = requests.get(url)

# Decode the JSON data into a dictionary: json_data
json_data = r.json()

# Print each key-value pair in json_data
for k in json_data.keys():
print(k + ': ', json_data[k])

从 API 如:维基百科

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
# Import package
import requests

# Assign URL to variable: url
url = 'https://en.wikipedia.org/w/api.php?action=query&prop=extracts&format=json&exintro=&titles=pizza'

# Package the request, send the request and catch the response: r
r = requests.get(url)

# Decode the JSON data into a dictionary: json_data
json_data = r.json()

# Print the Wikipedia page extract
pizza_extract = json_data['query']['pages']['24768']['extract']
print(pizza_extract)

从 API 如: 推特

1
2
3
4
5
6
7
8
9
10
11
12
13
# Import package
import tweepy

# Store OAuth authentication credentials in relevant variables
access_token = ""
access_token_secret = ""
consumer_key = ""
consumer_secret = ""

# Pass OAuth details to tweepy's OAuth handler
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token,access_token_secret)

条件导入

1
2
3
4
5
6
7
8
# 跳过第一行,至需要第一和第三列
data = np.loadtxt(file, delimiter='\t', skiprows=1, usecols=[0,2])

# delimiter分隔符
data = np.genfromtxt('titanic.csv', delimiter=',', names=True, dtype=None)

# nrows,只需要前面5行,header只可以等于None,而不是Bool
pd.read_csv(file,nrows=5, header=None)

获取文件列表

1
2
3
import os
wd = os.getcwd()
os.listdir(wd)

从 excel

1
2
3
4
5
6
7
8
9
10
11
# Import pandas
import pandas as pd

# Assign spreadsheet filename: file
file = 'battledeath.xlsx'

# Load spreadsheet: xl
xl = pd.ExcelFile(file)

# Print sheet names
print(xl.sheet_names)

从 SAS 文件

1
2
3
4
5
6
From sas7bdat import SAS7BDAT

Data = pd.read_stata('urbanpop.dta')
# sas文件变成dataframe
with SAS7BDAT('sales.sas7bdat') as file:
df_sas = file.to_data_frame()

从 mat文件

1
2
3
4
5
6
7
8
# Import package
import scipy.io

# Load MATLAB file: mat
mat = scipy.io.loadmat('albeck_gene_expression.mat')

# Print the datatype type of mat
print(type(mat))

从数据库

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
# Import packages
from sqlalchemy import create_engine
import pandas as pd

# Create engine: engine
engine = create_engine('sqlite:///Chinook.sqlite')

# Open engine connection: con
con = engine.connect()

# Perform query: rs
rs = con.execute('select * from Album')

# Save results of the query to DataFrame: df
df = pd.DataFrame(rs.fetchall())

# Close connection
con.close()

# Print head of DataFrame df
print(df.head())
从DB导入指定条数据,重命名列名:
with engine.connect() as con:
rs = con.execute('SELECT LastName, Title FROM Employee')
df = pd.DataFrame(rs.fetchmany(size=3))
df.columns = rs.keys()

# Print the length of the DataFrame df
print(len(df))

# Print the head of the DataFrame df
print(df.head())

提高数据导入速度