K-디지털트레이닝(빅데이터) 22일차

10916 단어 KDT pandas python KDT

오늘은 melt(), 반복문, 자료형변환, 슬라이스를 배웠다.

melt 메서드 사용하기

1. 1개의 열만 고정하고 나머지 열을 행으로 바꾸기

import pandas as pd
# Load the Pew data set on income and religion in the United States.
pew = pd.read_csv('data2/pew.csv')
print(pew.head(10))

# Show the first six columns (positions 0 through 5) of every row.
print(pew.iloc[:, 0:6])

# Keep 'religion' fixed as the identifier column and turn every other column
# into rows.
pew_long = pd.melt(pew, id_vars='religion')
print(pew_long.head(20))

# Same reshape, but name the two generated columns 'income' and 'count'
# instead of the default 'variable' / 'value'.
pew_long = pd.melt(pew, id_vars='religion', var_name='income', value_name='count')
print(pew_long.head())

5. 2개 이상의 열을 고정하고 나머지 열을 행으로 바꾸기

# Load the Billboard chart data.
billboard = pd.read_csv('data2/billboard.csv')

# Preview the first five rows and the first sixteen columns.
print(billboard.iloc[0:5, 0:16])

# Keep the five song-describing columns fixed and turn all remaining columns
# into rows, stored as a 'week' / 'rating' pair.
billboard_long = pd.melt(billboard, id_vars=['year', 'artist', 'track', 'time', 'date.entered'], var_name='week', value_name='rating')

print(billboard_long.head())

ebola 데이터 집합 살펴보기

# Load the Ebola time-series data and list its column names.
ebola = pd.read_csv('data2/country_timeseries.csv')
print(ebola.columns)

# Notebook-style display of the whole frame (shows nothing when run as a script).
ebola

# Look only at the columns of interest, first five rows.
print(ebola.iloc[:5, [0, 1, 2, 3, 10, 11]])

# Keep 'Date' and 'Day' fixed and turn every other column into rows.
ebola_long = pd.melt(ebola, id_vars=['Date', 'Day'])
print(ebola_long.head())

열 이름 분리하고 데이터프레임에 추가하기

# Split every value of the 'variable' column on '_'.
variable_split = ebola_long.variable.str.split('_')

print(variable_split[:10])

print(type(variable_split))  # the split result is a Series

print(type(variable_split[0]))  # each element stored in that Series is a list

# Use str.get to pull one position out of every list.
status_values = variable_split.str.get(0)  # index 0: the status (cases / deaths)
country_values = variable_split.str.get(1)  # index 1: the country name

# Inspect both ends of the two extracted Series.
print(status_values[:5])

print(status_values[-5:])

print(country_values[:5])

print(country_values[-5:])

# Add the two extracted Series to the frame as new, named columns.
ebola_long['status'] = status_values
ebola_long['country'] = country_values
print(ebola_long.head())

기상 데이터의 여러 열을 하나로 정리하기 ─ melt, pivot_table 메서드

# Load the weather data and preview the first five rows / eleven columns.
weather = pd.read_csv('data2/weather.csv')
print(weather.iloc[:5, :11])

# Notebook-style display of the whole frame (shows nothing when run as a script).
weather

# Turn the remaining columns into rows: one 'day' / 'temp' pair per row.
weather_melt = pd.melt(weather, id_vars=['id', 'year', 'month', 'element'], var_name='day', value_name='temp')
print(weather_melt.head())

# Spread the values of 'element' back out into separate columns.
weather_tidy = weather_melt.pivot_table(
    index=['id', 'year', 'month', 'day'],  # columns that stay as row identifiers
    columns='element',  # column whose values become the new column names
    values='temp'  # column that supplies the data for the new columns
)

print(weather_tidy)

# Move the index levels created by pivot_table back into ordinary columns.
weather_tidy_flat = weather_tidy.reset_index()
print(weather_tidy_flat.head())

빌보드 차트의 중복 데이터 처리하기

# Reload the Billboard data and rebuild the long ('week' / 'rating') form.
billboard = pd.read_csv('data2/billboard.csv')
billboard

billboard_long = pd.melt(billboard, id_vars=['year', 'artist', 'track', 'time', 'date.entered'], var_name='week', value_name='rating')


print(billboard_long.head())

print(billboard_long.head(5))

# Rows for the track named 'Loser': the song information repeats on every row.
print(billboard_long[billboard_long.track == 'Loser'].head())

billboard_long[billboard_long.track == 'Loser']

# Collect the columns that hold the repeated song information in their own frame.
billboard_songs = billboard_long[['year', 'artist', 'track', 'time','date.entered']]
print(billboard_songs.shape)

# Remove the duplicated rows so that each song appears once.
billboard_songs = billboard_songs.drop_duplicates()
print(billboard_songs.shape)

billboard_songs

# Give every remaining song a sequential id.
billboard_songs['id'] = range(len(billboard_songs))
billboard_songs

billboard_long

# Join on the five song columns so that each rating row also carries the song id.
billboard_ratings = billboard_long.merge( billboard_songs, on=['year', 'artist', 'track', 'time','date.entered'])
print(billboard_ratings.shape)

billboard_ratings

뉴욕 택시 데이터 준비

import os 
import urllib.request
# Download the files named on the first five lines of data2/raw_data_urls.txt
# into ../data.
download_dir = '../data'
# urlretrieve does not create missing directories, so make sure the target exists.
os.makedirs(download_dir, exist_ok=True)
with open('data2/raw_data_urls.txt', 'r') as data_urls:
    for line, url in enumerate(data_urls):
        if line == 5:
            break
        # Each line read from the file still ends with a newline; remove it once
        # so the printed URL, the file name and the request all use the clean value.
        url = url.strip()
        fn = url.split('/')[-1]  # last path component of the URL = file name
        fp = os.path.join(download_dir, fn)
        print(url)
        print(fp)
        urllib.request.urlretrieve(url, fp)

import glob 
# Collect the paths of the files in ../data whose names start with 'fhv_'.
nyc_taxi_data = glob.glob('../data/fhv_*')
print(nyc_taxi_data)

# Read the first five files one by one (the manual approach).
taxi1 = pd.read_csv(nyc_taxi_data[0])
taxi2 = pd.read_csv(nyc_taxi_data[1])
taxi3 = pd.read_csv(nyc_taxi_data[2])
taxi4 = pd.read_csv(nyc_taxi_data[3])
taxi5 = pd.read_csv(nyc_taxi_data[4])

# First two rows of each frame.
print(taxi1.head(n=2))
print(taxi2.head(n=2))
print(taxi3.head(n=2))
print(taxi4.head(n=2))
print(taxi5.head(n=2))

# (rows, columns) of each frame.
print(taxi1.shape)
print(taxi2.shape)
print(taxi3.shape)
print(taxi4.shape)
print(taxi5.shape)

# Stack the five frames on top of each other into a single frame.
taxi = pd.concat([taxi1, taxi2, taxi3, taxi4, taxi5])

print(taxi.shape)

반복문으로 데이터 준비하기

# Same preparation as the manual version, but reading the files in a loop.
list_taxi_df = []

for csv_filename in nyc_taxi_data:
    # Read one file per iteration and collect the resulting frame.
    df = pd.read_csv(csv_filename)
    list_taxi_df.append(df)

print(len(list_taxi_df))  # number of frames that were read

print(type(list_taxi_df[0]))  # type of one list element

print(list_taxi_df[0].head())

# Concatenate every collected frame into one.
taxi_loop_concat = pd.concat(list_taxi_df)
print(taxi_loop_concat.shape)

# Compare the loop-built frame with the manually built `taxi` frame.
print(taxi.equals(taxi_loop_concat))

자료형을 자유자재로 변환하기 ─ astype 메서드

import pandas as pd
import seaborn as sns

# Load seaborn's "tips" example data set; it is reused by the sections below.
tips = sns.load_dataset("tips")

2. 여러 가지 자료형을 문자열로 변환하기

# Notebook-style displays of the frame and of its column dtypes.
tips

tips.dtypes

# Store a string version of the 'sex' column in a new column 'sex_str'.
tips['sex_str'] = tips['sex'].astype(str)
print(tips.dtypes)

4. 자료형을 변환한 데이터 다시 원래대로 만들기

# Convert 'total_bill' to strings (the column dtype becomes object) ...
tips['total_bill'] = tips['total_bill'].astype(str)
print(tips.dtypes)

# ... and convert it back to float again.
tips['total_bill'] = tips['total_bill'].astype(float)
print(tips.dtypes)

잘못 입력한 문자열 처리하기 ─ to_numeric 메서드

# Work on an independent copy of the first ten rows: assigning into a bare
# slice of `tips` triggers SettingWithCopyWarning.
tips_sub_miss = tips.head(10).copy()
# Hold the column as generic objects first; writing strings straight into a
# float column is deprecated in recent pandas releases.
tips_sub_miss['total_bill'] = tips_sub_miss['total_bill'].astype(object)
tips_sub_miss.loc[[1, 3, 5, 7], 'total_bill'] = 'missing'  # corrupt rows 1, 3, 5 and 7

print(tips_sub_miss)

print(tips_sub_miss.dtypes)     # total_bill now holds a mix of numbers and strings

# astype(float) does not know how to convert 'missing'. The failure is the
# point of this step, so report it and carry on instead of stopping the script.
try:
    tips_sub_miss['total_bill'].astype(float)
except ValueError as err:
    print(err)

# pd.to_numeric(target, errors='raise' | 'coerce') -- with the default
# errors='raise' it fails on 'missing' in the same way.
try:
    pd.to_numeric(tips_sub_miss['total_bill'])
except ValueError as err:
    print(err)

# "Ignore the error": keep the column unchanged when it cannot be converted.
# errors='ignore' used to do this but is deprecated in pandas, so the
# exception is caught explicitly instead.
try:
    tips_sub_miss['total_bill'] = pd.to_numeric(tips_sub_miss['total_bill'])
except ValueError:
    pass  # deliberate: the column is left exactly as it was

print(tips_sub_miss.dtypes)

# errors='coerce' turns every value that cannot be parsed ('missing') into a
# missing value (NaN).
tips_sub_miss['total_bill'] = pd.to_numeric(tips_sub_miss['total_bill'], errors='coerce')

print(tips_sub_miss.dtypes)

# downcast additionally shrinks the result to a smaller numeric type.
tips_sub_miss['total_bill'] = pd.to_numeric(tips_sub_miss['total_bill'], errors='coerce', downcast='float')

print(tips_sub_miss.dtypes)

문자열을 카테고리로 변환하기(179쪽)

# 'sex' stored as plain strings (object dtype).
tips['sex'] = tips['sex'].astype('str')
print(tips.info())

# 'sex' stored as a category; compare the two info() reports.
tips['sex'] = tips['sex'].astype('category')
print(tips.info())

문자열 추출하기

# Sample strings used by the slicing examples that follow.
word = 'grail'
sent = 'a scratch'

# Notebook-style displays: the first character of each string ...
word[0]

sent[0]

# ... and the characters at positions 0, 1 and 2 (the stop index is excluded).
word[0:3]

---

# Negative indices count from the end of the string.
print(sent[-1])  # last character

print(sent[-9:-8])  # 'a scratch' has 9 characters, so -9 is the first one

print(sent[0:-8])  # same single character, with a positive start index

전체 문자열을 추출할 때 음수를 사용하면 안 됩니다

sent  # notebook-style display of the whole string

# The stop index is exclusive, so stopping at -1 leaves out the final
# character: this prints 'scratc', not 'scratch'.
print(sent[2:-1])

# Starting from -7 has the same problem; the last character is still dropped.
print(sent[-7:-1])

# len() returns one more than the last valid index, so it works as a stop
# value that includes the final character.
s_len = len(sent)
print(s_len)

print(sent[2:s_len])    # reaches the end of the string: 'scratch'

왼쪽이나 오른쪽 범위를 지정하지 않고 문자열 추출하기

print(word[0:3])

print(word[ :3])    # an omitted start means "from the beginning"

print(sent[2:len(sent)])

print(sent[2: ])    # an omitted stop means "through the last character"

print(sent[ : ])   # both omitted: the whole string

print(sent[::2])    # whole string, taking every second character

join, splitlines, replace 메서드 실습하기

1. join 메서드

# Degrees, minutes, seconds and hemisphere of the first coordinate ...
d1, m1, s1, u1 = '40°', "46'", '52.837"', 'N'
# ... and of the second coordinate.
d2, m2, s2, u2 = '73°', "58'", '26.302"', 'W'

# str.join glues the pieces together, placing '*' between neighbours.
coords = '*'.join((d1, m1, s1, u1) + (d2, m2, s2, u2))
print(coords)

2. splitlines 메서드

# A multi-line string in which the two speakers alternate line by line.
multi_str = """Guard: What? Ridden on a horse?
King Arthur: Yes!
Guard: You're using coconuts!
King Arthur: What?
Guard: You've got ... coconut[s] and you're bangin' 'em together. 
""" 
print(multi_str)

# splitlines breaks the text at line boundaries and returns a list of lines.
multi_str_split = multi_str.splitlines()
print(multi_str_split)

# Every second line, starting with the first: the Guard's lines.
guard = multi_str_split[::2]
print(guard)

4. replace 메서드

# Strip the "Guard: " speaker label (including the space after the colon) so
# the Guard's lines start directly with the spoken text, then keep every
# second line. Replacing only "Guard:" with " " left two leading spaces.
guard = multi_str.replace("Guard: ", "").splitlines()[::2]
print(guard)

문자열 포매팅하기

var = 'flesh wound'
s = "It's just a {}!"  # {} is a placeholder that format() fills in

print(s.format(var))

print(s.format('scratch'))

# A numbered placeholder can be reused: both {0} receive the same argument.
s = """Black Knight: 'Tis but a {0}.
King Arthur: A {0}? Your arm's off!
"""
print(s.format('scratch'))

# Named placeholders are filled from keyword arguments.
s = 'Hayden Planetarium Coordinates: {lat}, {lon}'
print(s.format(lat='40.7815°N', lon='73.9733°W'))

숫자 데이터 포매팅하기

print('Some digits of pi: {}'.format(3.14159265359))

# {:,} inserts a comma as the thousands separator.
print("In 2005, Lu Chao of China recited {:,} digits of pi".format(67890))

# {0:.4} keeps 4 significant digits; {0:.4%} shows the same value as a
# percentage with 4 digits after the decimal point.
print("I remember {0:.4} or {0:.4%} of what Lu Chao recited".format(7/67890))

# {0:05d} pads the integer with leading zeros to a width of 5.
print("My ID number is {0:05d}".format(42))

% 연산자로 포매팅하기

s = 'I only know %d digits of pi' % 7  # %d is replaced by the integer 7
print(s)

# %(key)s and %(key).2f look their value up by key in the dict on the right;
# .2f prints the number with two digits after the decimal point.
print('Some digits of %(cont)s: %(value).2f' % {'cont': 'e', 'value': 2.718})

f-strings로 포매팅 사용하기

# An f-string evaluates the expressions inside {} where the literal is written.
var = 'flesh wound'
s = f"It's just a {var}!"
print(s)

# Several variables can be interpolated in the same literal.
lat, lon = '40.7815°N', '73.9733°W'
s = f'Hayden Planetarium Coordinates: {lat}, {lon}'
print(s)

정규식으로 전화번호 패턴 찾기

import re

# All patterns below are raw strings (r'...') so that sequences such as \d, \s
# and \( reach the regex engine unchanged; in a normal string literal they are
# invalid escape sequences and make Python emit a warning.

# Ten digits with no separators.
tele_num = '1234567890'

m = re.match(pattern=r'\d\d\d\d\d\d\d\d\d\d', string=tele_num)
print(type(m))

print(m)

# A match object is truthy; a failed match returns None, which is falsy.
print(bool(m))

if m:
    print('match')
else:
    print('no match')

# Where the match starts and ends, both positions as a pair, and the matched text.
print(m.start())

print(m.end())

print(m.span())

print(m.group())

# The same number written with spaces: ten consecutive digits no longer match.
tele_num_spaces = '123 456 7890'

m = re.match(pattern=r'\d{10}', string=tele_num_spaces)
print(m)

if m:
    print('match')
else:
    print('no match')

# Allow an optional whitespace character between the digit groups.
p = r'\d{3}\s?\d{3}\s?\d{4}'
m = re.match(pattern=p, string=tele_num_spaces)
print(m)

# Also allow optional parentheses around the area code and an optional dash.
tele_num_space_paren_dash = '(123) 456-7890'
p = r'\(?\d{3}\)?\s?\d{3}\s?-?\d{4}'
m = re.match(pattern=p, string=tele_num_space_paren_dash)
print(m)

# Also allow a leading country code 1 with an optional '+'.
cnty_tele_num_space_paren_dash = '+1 (123) 456-7890'
p = r'\+?1\s?\(?\d{3}\)?\s?\d{3}\s?-?\d{4}'
m = re.match(pattern=p, string=cnty_tele_num_space_paren_dash)
print(m)

compile 메서드로 정규식 메서드 사용하기

# Compile the pattern once and call match() on the compiled object. The
# pattern is a raw string so that \d is passed to the regex engine unchanged.
p = re.compile(r'\d{10}')
s = '1234567890'
m = p.match(s)
print(m)

Author And Source

이 글(K-디지털트레이닝(빅데이터) 22일차)의 원문과 더 많은 자료는 다음 링크에서 확인할 수 있습니다: https://velog.io/@y7y1h13/K-디지털트레이닝빅데이터-22일차

우수한 개발자 콘텐츠 발견에 전념 (Collection and Share based on the CC Protocol.)

Coredata 너무 어려워 요.

Apache Flink 학습 노트 (2)

좋은 웹페이지 즐겨찾기

개발자 우수 사이트 수집

개발자가 알아야 할 필수 사이트 100선 추천 우리는 당신을 위해 100개의 자주 사용하는 개발자 학습 사이트를 정리했습니다