https://github.com/RitterHou/music-163
爬取网易云音乐所有歌曲的评论数。主要思路如下:
先爬取所有歌手信息(artists.py);再根据歌手信息爬取所有专辑信息(album_by_artist.py);然后根据专辑信息爬取所有歌曲信息(music_by_album.py);最后根据歌曲信息爬取其评论条数(comments_by_music.py)。数据库相关的语句都存放于 sql.py 中。
import requests
from bs4 import BeautifulSoup
# Browser-like request headers passed to the artist catalogue request below.
# NOTE(review): the Cookie value embeds what looks like a personal login
# session (NTES_SESS / NTES_PASSPORT tokens and an e-mail address). Confirm
# whether it is still required; if so, load it from configuration rather than
# keeping it in the source.
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate, sdch',
'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Cookie': '_ntes_nnid=7eced19b27ffae35dad3f8f2bf5885cd,1476521011210; _ntes_nuid=7eced19b27ffae35dad3f8f2bf5885cd; usertrack=c+5+hlgB7TgnsAmACnXtAg==; Province=025; City=025; NTES_PASSPORT=6n9ihXhbWKPi8yAqG.i2kETSCRa.ug06Txh8EMrrRsliVQXFV_orx5HffqhQjuGHkNQrLOIRLLotGohL9s10wcYSPiQfI2wiPacKlJ3nYAXgM; P_INFO=hourui93@163.com|1476523293|1|study|11&12|jis&1476511733&mail163#jis&320100#10#0#0|151889&0|g37_client_check&mailsettings&mail163&study&blog|hourui93@163.com; NTES_SESS=Fa2uk.YZsGoj59AgD6tRjTXGaJ8_1_4YvGfXUkS7C1NwtMe.tG1Vzr255TXM6yj2mKqTZzqFtoEKQrgewi9ZK60ylIqq5puaG6QIaNQ7EK5MTcRgHLOhqttDHfaI_vsBzB4bibfamzx1.fhlpqZh_FcnXUYQFw5F5KIBUmGJg7xdasvGf_EgfICWV; S_INFO=1476597594|1|0&80##|hourui93; NETEASE_AUTH_SOURCE=space; NETEASE_AUTH_USERNAME=hourui93; _ga=GA1.2.1405085820.1476521280; JSESSIONID-WYYY=cbd082d2ce2cffbcd5c085d8bf565a95aee3173ddbbb00bfa270950f93f1d8bb4cb55a56a4049fa8c828373f630c78f4a43d6c3d252c4c44f44b098a9434a7d8fc110670a6e1e9af992c78092936b1e19351435ecff76a181993780035547fa5241a5afb96e8c665182d0d5b911663281967d675ff2658015887a94b3ee1575fa1956a5a%3A1476607977016; _iuqxldmzr_=25; __utma=94650624.1038096298.1476521011.1476595468.1476606177.8; __utmb=94650624.20.10.1476606177; __utmc=94650624; __utmz=94650624.1476521011.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)',
'DNT': '1',
'Host': 'music.163.com',
'Pragma': 'no-cache',
'Referer': 'http://music.163.com/',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
}
# Fetch one artist catalogue page (id=4003, initial=0).
group_id = 4003
initial = 0
params = dict(id=group_id, initial=initial)
r = requests.get(
    'http://music.163.com/discover/artist/cat',
    headers=headers,
    params=params,
)

# Parse the returned HTML.
soup = BeautifulSoup(r.content.decode(), 'html.parser')
body = soup.body

# Anchors with class "msk": artist id -> artist name.
hotartist_dic = {}
hot_artists = body.find_all('a', attrs={'class': 'msk'})
for artist in hot_artists:
    artist_id = artist['href'].replace('/artist?id=', '').strip()
    artist_name = artist['title'].replace('的音乐', '')
    try:
        hotartist_dic[artist_id] = artist_name
    except Exception as e:
        # Print the error as a log line.
        print(e)

# Anchors with class "nm nm-icn f-thide s-fc0": artist id -> artist name.
artist_dic = {}
artists = body.find_all('a', attrs={'class': 'nm nm-icn f-thide s-fc0'})
for artist in artists:
    artist_id = artist['href'].replace('/artist?id=', '').strip()
    artist_name = artist['title'].replace('的音乐', '')
    try:
        artist_dic[artist_id] = artist_name
    except Exception as e:
        # Print the error as a log line.
        print(e)
artist_dic
{'1046093': 'Laxmikant-Pyarelal', '1049361': 'ГРУППА ПИЦЦА', '106666': 'Дрыгва', '106719': 'คาราบาว', '106733': 'ДДТ', '106998': '25 hours', '1078416': 'บีไฟว์', '1083129': '2nd Room', '1142003': 'Cutie*', '1143051': 'SkyLights', '1159115': '-Sunny-Youth-', '1184013': 'ForceZL', '1194100': 'GOMAR\xa0STUDIO', '1194110': 'ELYAR-PULAT', '12025447': 'Wukong Defunct', '12032215': '欧洲音厨', '12051100': '抛抛', '12070076': '惊奇海洋(Marvel ocean)', '1207047': 'eigenTunes亦听', '12071037': 'Weed', '1209010': '一个大G', '1211067': 'Quartz', '12111066': 'Husan', '12119141': '巅藏说唱团', '12122134': '365 DaBand', '12139012': 'Do\xa0Shit', '12147367': 'Near\xa0Death\xa0Experience/病亟', '12147390': 'WEON MASHUP', '12172418': 'ARnPRo', '12172468': 'MelodyGarden', '12173248': 'DJ宋雨飞', '12174281': 'PSYKHON', '12185319': 'RTG', '12185381': '漏音器\xa0The\xa0Lowincher', '12194273': '六甲番', '12194760': '菩提集团', '12194976': 'BNC$BrandNewCohort', '12200872': 'SIH SHANDIIN HOOLAI', '12200958': 'NiXaN说唱组合', '12204600': '9596乐队', '12236432': 'Mzoce', '12237434': 'Wiener Sängerknaben', '12258195': '向洋乐团', '12259787': '4oot', '12270840': '准噶尔乐队', '12271466': '魔幻之声口琴重奏团', '12275027': 'LightCould', '12276318': 'Cheetah\xa0Mobile Games', '12281709': 'RED', '12287113': 'GANGSAMOSA', '12288929': 'Elecrystal\xa0Sound\xa0Team', '12291554': 'H.M.Funk', '12317490': '廢兔Itsuki', '12324675': 'GumNam', '12357295': 'DAVID BORING', '12359012': 'ART\xa0RAP', '12359215': '野狼乐队', '12373475': 'Goodbye\xa0Honey\xa0Boy', '12392015': '木野', '12394484': '布日德组合', '12420908': 'INVADE', '12424041': 'İmera', '12497408': 'Controls', '12511077': 'Dahlia\xa0Rosea(玫瑰博士)', '12538438': 'Checkit', '12568487': '新疆Kelkvn说唱团体', '12580237': '增城捌贰陆大哥大保健娱乐有限公司', '12600335': '林如韵', '12610587': '千夜', '12641213': 'Poorman', '12641527': 'UMI NOISE', '12642917': 'AMTwo', '12648648': 'K.P.R.', '12768267': 'Waif\xa0&\xa0Zilch', '12814141': 'Unicorn_独角兽', '12897296': '444+222', '12924615': 'ShahMat', '12924892': 'K-AN', '12967340': 'The\xa0Messy梅西合唱团', 
'13022785': '988 DJs', '13037523': 'Liberation', '13110785': 'Jangaa组合', '13288090': '范世坤吉他私塾', '13416134': '骏景小学合唱团', '13612241': 'designco', '13612402': '清华大学键盘队', '13683852': 'ZebraZebra', '13701254': 'SHIRAQ', '13793310': '贝叔', '13793901': '王雪山', '13908376': 'Anuradha', '14119844': 'HASG\xa0POP', '14804020': 'Bana-X', '28186665': 'T&T', '30126361': '$uper旧街口', '30380343': 'Blast Way Z', '30644313': 'bltchigga', '818191': 'Мельница', '823055': '3.2.1', '942020': ' Johnyboy'}
def save_artist(group_id, initial, hot_artist_dic, artisti_dic):
    """Fetch one artist catalogue page and record every artist found on it.

    Requests ``http://music.163.com/discover/artist/cat`` with
    ``id=group_id`` and ``initial=initial`` and fills both dictionaries in
    place, mapping artist id (str) to artist name (str).

    Args:
        group_id: value sent as the ``id`` query parameter.
        initial: value sent as the ``initial`` query parameter.
        hot_artist_dic: dict updated from the ``<a class="msk">`` anchors.
        artisti_dic: dict updated from the
            ``<a class="nm nm-icn f-thide s-fc0">`` anchors. The parameter
            name is kept as originally spelled so keyword callers keep working.

    Returns:
        None. Results are delivered through the two dictionaries.
    """
    params = {'id': group_id, 'initial': initial}
    r = requests.get('http://music.163.com/discover/artist/cat', params=params)
    # Parse the page.
    soup = BeautifulSoup(r.content.decode(), 'html.parser')
    body = soup.body
    hot_artists = body.find_all('a', attrs={'class': 'msk'})
    artists = body.find_all('a', attrs={'class': 'nm nm-icn f-thide s-fc0'})

    # Both anchor kinds are read the same way (href + title), so one loop
    # serves both. The second target is the dict that was passed in, not a
    # module-level global of a similar name.
    for anchors, target in ((hot_artists, hot_artist_dic), (artists, artisti_dic)):
        for artist in anchors:
            try:
                artist_id = artist['href'].replace('/artist?id=', '').strip()
                artist_name = artist['title'].replace('的音乐', '')
            except KeyError as e:
                # An anchor lacking href/title is reported and skipped so one
                # malformed entry does not abort the whole page.
                print(e)
                continue
            target[artist_id] = artist_name
# Trial run of save_artist on a single page (id 4003, initial 0) using fresh,
# empty result dictionaries.
gg = 4003
initial = 0
artist_dic = {}
hot_artist_dic = {}
save_artist(gg, initial, hot_artist_dic, artist_dic )
artist_dic
{'1046093': 'Laxmikant-Pyarelal', '1049361': 'ГРУППА ПИЦЦА', '1050038': 'Фактор-2', '106666': 'Дрыгва', '106672': 'Камаедзiца', '106719': 'คาราบาว', '106733': 'ДДТ', '106985': 'Харизма', '106997': '17:28', '106998': '25 hours', '1078416': 'บีไฟว์', '1083129': '2nd Room', '1143051': 'SkyLights', '1158110': 'ChinaDJRadio', '1160013': '一个人的宇宙', '12082153': 'Самое большое простое число ', '12139012': 'Do\xa0Shit', '12191177': 'รวมศิลปิน Luster', '12194976': 'BNC$BrandNewCohort', '12200872': 'SIH SHANDIIN HOOLAI', '12204600': '9596乐队', '12215016': 'Groovy LIVE SYSU', '12424041': 'İmera', '12509248': 'Rabbit工作室', '12639444': 'NECROSADISTIC PUNISHMENT', '12641336': '南之南', '12699532': 'PDWN', '12788512': '拾旬乐', '12797604': '亥門', '12814141': 'Unicorn_official', '12867140': 'Xiao·Xin', '12897296': '444+222', '12897309': '蓝色妮可', '12924892': 'K-AN', '12943092': '堆填区', '12955345': 'HAKAN', '12967340': 'The\xa0Messy梅西合唱团', '12974253': "นา'กา", '12975240': 'ไมโคร', '12977209': 'เจนนี่ VS คอรี่', '13006273': '演绎', '13011233': '5 สาวฝุ่นตลบ', '13012236': 'อินคา', '13022208': 'เบิร์ด & เสก', '13023092': 'บางแก้ว', '13037921': 'MANGOMusic', '13109480': 'Memetjan_Alim', '13110785': 'Jangaa组合', '13151437': 'Karmashsa Ansambli', '13222504': 'Линник', '13222529': 'Улиткас', '13226421': 'てん、', '13226728': 'สักวา', '13227224': "'לוקץ", '13284125': 'The\xa0Singers\xa0of\xa0Lights', '13284192': 'Buzz Fridge', '13285024': 'NamelessTag無名標籤', '13429324': 'Олег Пунгин', '13430117': 'Конец фильма', '13430153': 'ГАЛЯЦЭНАГЕН', '13462069': 'Христина Соловій', '13464081': '#Холостячки', '13465021': 'שמעון ולוי', '13465093': 'שלמה ארצי', '13484124': 'จ่อย รวมมิตร', '13484209': 'Вика Курзова', '13484253': 'Фибры', '13485181': 'Леонид Руденко', '13485216': 'Шампанского Пожалуйста!', '13485217': 'Саша Ветер', '13485332': 'Сањар и Сам Я', '13486037': '\u200bPura Mashankura', '13486232': 'מור ברנשטיין', '13497013': '2 Of Us', '13497129': 'Артём Угловский', '13498112': 'เซ็กซ์ ตลาดแตก', '13498157': 'עילי 
בוטנר', '13500036': 'อ้อมใจ มหาหิงค์', '13500833': 'האחיות כרקוקלי', '13683852': 'ZebraZebra', '13707392': 'מועדון הקצב של אביהו פנחסוב', '13790097': 'Major\xa0Fifth', '13793613': 'NIIGELL\xa0QGQII', '13793901': '王雪山', '13908376': 'Anuradha', '13911469': 'Yellow Duck!', '14100185': 'โอ๊ะโอ', '14101200': 'สี่โพดำ', '711224': " C'mon Lennon", '713076': ' Dewa Budjana', '753041': 'ידידיה וגבריאל בלחסן', '763093': ' Пионерлагерь Пыльная Радуга', '784048': 'Чайф', '794269': 'ОбщежитиЕ ', '818191': 'Мельница', '823055': '3.2.1', '827392': 'Джинсовые мальчики', '908212': 'Психея ', '942020': ' Johnyboy', '957151': '██████'}
# Crawl one page per "initial" value, where the values are the code points of
# the uppercase letters A..Z (65..90), accumulating into fresh dictionaries.
hot_artist_dic = {}
artist_dic = {}
for i in range(ord('A'), ord('Z') + 1):
    print(i)
    save_artist(gg, i, hot_artist_dic, artist_dic)
65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
len(hot_artist_dic)
254
len(artist_dic)
1608
list(hot_artist_dic.keys())[0]
'89659'
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate, sdch',
'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Cookie': '_ntes_nnid=7eced19b27ffae35dad3f8f2bf5885cd,1476521011210; _ntes_nuid=7eced19b27ffae35dad3f8f2bf5885cd; usertrack=c+5+hlgB7TgnsAmACnXtAg==; Province=025; City=025; _ga=GA1.2.1405085820.1476521280; NTES_PASSPORT=6n9ihXhbWKPi8yAqG.i2kETSCRa.ug06Txh8EMrrRsliVQXFV_orx5HffqhQjuGHkNQrLOIRLLotGohL9s10wcYSPiQfI2wiPacKlJ3nYAXgM; P_INFO=hourui93@163.com|1476523293|1|study|11&12|jis&1476511733&mail163#jis&320100#10#0#0|151889&0|g37_client_check&mailsettings&mail163&study&blog|hourui93@163.com; JSESSIONID-WYYY=189f31767098c3bd9d03d9b968c065daf43cbd4c1596732e4dcb471beafe2bf0605b85e969f92600064a977e0b64a24f0af7894ca898b696bd58ad5f39c8fce821ec2f81f826ea967215de4d10469e9bd672e75d25f116a9d309d360582a79620b250625859bc039161c78ab125a1e9bf5d291f6d4e4da30574ccd6bbab70b710e3f358f%3A1476594130342; _iuqxldmzr_=25; __utma=94650624.1038096298.1476521011.1476588849.1476592408.6; __utmb=94650624.11.10.1476592408; __utmc=94650624; __utmz=94650624.1476521011.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)',
'DNT': '1',
'Host': 'music.163.com',
'Pragma': 'no-cache',
'Referer': 'http://music.163.com/',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
}
def save_albums(artist_id, albume_dic):
    """Record every album listed on an artist's album page.

    Each album id found on the page becomes a key of ``albume_dic`` whose
    value is ``artist_id``. The dictionary is modified in place and nothing
    is returned.
    """
    # Fetch the artist's album listing (sent with limit=200).
    response = requests.get(
        'http://music.163.com/artist/album',
        headers=headers,
        params={'id': artist_id, 'limit': '200'},
    )
    page = BeautifulSoup(response.content.decode(), 'html.parser')
    # Album links are the anchors with class "tit s-fc0".
    album_links = page.body.find_all('a', attrs={'class': 'tit s-fc0'})
    for link in album_links:
        album_key = link['href'].replace('/album?id=', '')
        albume_dic[album_key] = artist_id
# Trial run: collect the albums of artist '89659' into a fresh dictionary.
albume_dic = {}
save_albums('89659', albume_dic)
albume_dic
{'2903111': '89659', '37104113': '89659', '37104284': '89659', '37104348': '89659', '37104857': '89659', '37110077': '89659', '37110141': '89659', '37110256': '89659', '37110395': '89659', '37110462': '89659', '37110655': '89659', '37110751': '89659', '37110871': '89659', '37934081': '89659', '37934219': '89659'}
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate, sdch',
'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'Cookie': '_ntes_nnid=7eced19b27ffae35dad3f8f2bf5885cd,1476521011210; _ntes_nuid=7eced19b27ffae35dad3f8f2bf5885cd; usertrack=c+5+hlgB7TgnsAmACnXtAg==; Province=025; City=025; NTES_PASSPORT=6n9ihXhbWKPi8yAqG.i2kETSCRa.ug06Txh8EMrrRsliVQXFV_orx5HffqhQjuGHkNQrLOIRLLotGohL9s10wcYSPiQfI2wiPacKlJ3nYAXgM; P_INFO=hourui93@163.com|1476523293|1|study|11&12|jis&1476511733&mail163#jis&320100#10#0#0|151889&0|g37_client_check&mailsettings&mail163&study&blog|hourui93@163.com; _ga=GA1.2.1405085820.1476521280; JSESSIONID-WYYY=fb5288e1c5f667324f1636d020704cab2f27ee915622b114f89027cbf60c38be2af6b9cbef2223c1f2581e3502f11b86efd60891d6f61b6f783c0d55114f8269fa801df7352f5cc4c8259876e563a6bd0212b504a8997723a0593b21d5b3d9076d4fa38c098be68e3c5d36d342e4a8e40c1f73378cec0b5851bd8a628886edbdd23a7093%3A1476623819662; _iuqxldmzr_=25; __utma=94650624.1038096298.1476521011.1476610320.1476622020.10; __utmb=94650624.14.10.1476622020; __utmc=94650624; __utmz=94650624.1476521011.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)',
'DNT': '1',
'Host': 'music.163.com',
'Pragma': 'no-cache',
'Referer': 'http://music.163.com/',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
}
def save_music(album_id, music_dic):
    """Record every track listed on an album page.

    ``music_dic`` is updated in place with ``song id -> [song name,
    album_id]``. Nothing is returned.
    """
    # Fetch the page that belongs to this album.
    response = requests.get(
        'http://music.163.com/album',
        headers=headers,
        params={'id': album_id},
    )
    page = BeautifulSoup(response.content.decode(), 'html.parser')
    # The tracks are the <li> items of the <ul class="f-hide"> element.
    track_list = page.body.find('ul', attrs={'class': 'f-hide'})
    for item in track_list.find_all('li'):
        link = item.find('a')
        song_id = link['href'].replace('/song?id=', '')
        song_name = link.getText()
        music_dic[song_id] = [song_name, album_id]
list(albume_dic.keys())[0]
'37110871'
# Trial run: collect the tracks of album '37110871' into a fresh dictionary.
music_dic = {}
save_music('37110871', music_dic)
music_dic
{'527013176': ['อย่ารักใครข้างเดียว', '37110871'], '527013177': ['จะไม่รับปาก', '37110871'], '527013178': ['เจ้าหญิงนิทรา', '37110871'], '527013179': ['หุ่นกระป๋อง', '37110871'], '527013180': ['เธอจะอยู่กับฉันตลอดไป', '37110871'], '527013181': ['เมืองคนเหล็ก', '37110871'], '527013182': ['เพลงผีเสื้อ', '37110871'], '527013183': ['วังวน', '37110871'], '527013184': ['ปฏิเสธรัก', '37110871'], '527013185': ['ชีวิต มิตรภาพ ความรัก', '37110871'], '527013186': ['ปฏิเสธไม่ได้ว่ารักเธอ Feat. แบงค์ แคลช', '37110871'], '527013187': ['เพลงผีเสื้อ', '37110871'], '527013188': ['ชีวิต มิตรภาพ ความรัก Concert', '37110871']}
http://music.163.com/#/song?id=516997458
很遗憾,评论数虽然也在详情页内,但是网易云音乐做了防爬处理。
我们就找一找这个API吧:通过观察XHR请求,发现是下面这个接口。
响应结果很丰富,所有评论相关的数据都有;不过经过观察发现这个API是经过加密处理的,不过没关系……
headers = {
'Host': 'music.163.com',
'Connection': 'keep-alive',
'Content-Length': '484',
'Cache-Control': 'max-age=0',
'Origin': 'http://music.163.com',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.84 Safari/537.36',
'Content-Type': 'application/x-www-form-urlencoded',
'Accept': '*/*',
'DNT': '1',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4',
'Cookie': 'JSESSIONID-WYYY=b66d89ed74ae9e94ead89b16e475556e763dd34f95e6ca357d06830a210abc7b685e82318b9d1d5b52ac4f4b9a55024c7a34024fddaee852404ed410933db994dcc0e398f61e670bfeea81105cbe098294e39ac566e1d5aa7232df741870ba1fe96e5cede8372ca587275d35c1a5d1b23a11e274a4c249afba03e20fa2dafb7a16eebdf6%3A1476373826753; _iuqxldmzr_=25; _ntes_nnid=7fa73e96706f26f3ada99abba6c4a6b2,1476372027128; _ntes_nuid=7fa73e96706f26f3ada99abba6c4a6b2; __utma=94650624.748605760.1476372027.1476372027.1476372027.1; __utmb=94650624.4.10.1476372027; __utmc=94650624; __utmz=94650624.1476372027.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)',
}
params = {
'csrf_token': ''
}
data = {
'params': '5L+s/X1qDy33tb2sjT6to2T4oxv89Fjg1aYRkjgzpNPR6hgCpp0YVjNoTLQAwWu9VYvKROPZQj6qTpBK+sUeJovyNHsnU9/StEfZwCOcKfECFFtAvoNIpulj1TDOtBir',
'encSecKey': '59079f3e07d6e240410018dc871bf9364f122b720c0735837d7916ac78d48a79ec06c6307e6a0e576605d6228bd0b377a96e1a7fc7c7ddc8f6a3dc6cc50746933352d4ec5cbe7bddd6dcb94de085a3b408d895ebfdf2f43a7c72fc783512b3c9efb860679a88ef21ccec5ff13592be450a1edebf981c0bf779b122ddbd825492'
}
limit 是每页返回的评论数量,offset 是向后的偏移量(下面依次为 0、20、40)。
http://music.163.com/api/v1/resource/comments/R_SO_4_516997458?limit=20&offset=0
http://music.163.com/api/v1/resource/comments/R_SO_4_516997458?limit=20&offset=20
http://music.163.com/api/v1/resource/comments/R_SO_4_516997458?limit=20&offset=40
print(url)
http://music.163.com/api/v1/resource/comments/R_SO_4_516997458?limit=20&offset=0
# Request the comments of song 516997458 at offset 0 from the plain /api/
# endpoint, posting the previously defined `data` form, and decode the JSON.
offset = 0
music_id = '516997458'
url = 'http://music.163.com/api/v1/resource/comments/R_SO_4_'+ music_id + '?limit=20&offset=' + str(offset)
response = requests.post(url, headers=headers, data=data)
cj = response.json()
cj.keys()
dict_keys(['total', 'topComments', 'hotComments', 'moreHot', 'more', 'userId', 'code', 'isMusician', 'comments'])
cj['total'],len(cj['comments']), len(cj['hotComments']), len(cj['topComments'])
(8054, 20, 15, 0)
cj['comments'][0]
{'beReplied': [], 'commentId': 1112523641, 'content': '喜欢双笙,喜欢这首歌', 'isRemoveHotComment': False, 'liked': False, 'likedCount': 1, 'pendantData': None, 'time': 1525904882188, 'user': {'authStatus': 0, 'avatarUrl': 'http://p1.music.126.net/Eklu6D8QoR1Hb5UhLhCzPw==/109951163288324813.jpg', 'expertTags': None, 'experts': None, 'locationInfo': None, 'nickname': '狂妄嘻嘻', 'remarkName': None, 'userId': 1451756393, 'userType': 0, 'vipType': 0}}
# Same request as before, now with offset 20.
offset = 20
music_id = '516997458'
url = 'http://music.163.com/api/v1/resource/comments/R_SO_4_'+ music_id + '?limit=20&offset=' + str(offset)
response = requests.post(url, headers=headers, data=data)
cj = response.json()
cj.keys()
dict_keys(['total', 'topComments', 'more', 'userId', 'isMusician', 'code', 'comments'])
len(cj['comments'])
20
cj['comments'][0]
{'beReplied': [], 'commentId': 1107837178, 'content': '冥月声音好听好温柔[爱心]表白', 'isRemoveHotComment': False, 'liked': False, 'likedCount': 3, 'pendantData': None, 'time': 1525515089450, 'user': {'authStatus': 0, 'avatarUrl': 'http://p1.music.126.net/suhvzXk2pEUOaeHUPU0aQQ==/109951163173870029.jpg', 'expertTags': None, 'experts': None, 'locationInfo': None, 'nickname': '黴祇', 'remarkName': None, 'userId': 619018018, 'userType': 0, 'vipType': 0}}
from Crypto.Cipher import AES
import base64
import requests
import json
import time
# headers
headers = {
'Host': 'music.163.com',
'Connection': 'keep-alive',
'Content-Length': '484',
'Cache-Control': 'max-age=0',
'Origin': 'http://music.163.com',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.84 Safari/537.36',
'Content-Type': 'application/x-www-form-urlencoded',
'Accept': '*/*',
'DNT': '1',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4',
'Cookie': 'JSESSIONID-WYYY=b66d89ed74ae9e94ead89b16e475556e763dd34f95e6ca357d06830a210abc7b685e82318b9d1d5b52ac4f4b9a55024c7a34024fddaee852404ed410933db994dcc0e398f61e670bfeea81105cbe098294e39ac566e1d5aa7232df741870ba1fe96e5cede8372ca587275d35c1a5d1b23a11e274a4c249afba03e20fa2dafb7a16eebdf6%3A1476373826753; _iuqxldmzr_=25; _ntes_nnid=7fa73e96706f26f3ada99abba6c4a6b2,1476372027128; _ntes_nuid=7fa73e96706f26f3ada99abba6c4a6b2; __utma=94650624.748605760.1476372027.1476372027.1476372027.1; __utmb=94650624.4.10.1476372027; __utmc=94650624; __utmz=94650624.1476372027.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)',
}
def get_params(first_param, forth_param):
    """Return ``first_param`` AES-encrypted twice, as a base64 text string.

    The first pass uses ``forth_param`` as the key and the second pass uses
    the fixed key ``'F' * 16``; both passes share the IV
    ``"0102030405060708"``.
    """
    iv_bytes = "0102030405060708".encode()
    inner = AES_encrypt(first_param, forth_param.encode(), iv_bytes)
    # The first result is base64 bytes; it is decoded back to text before
    # being encrypted a second time.
    outer = AES_encrypt(inner.decode(), (16 * 'F').encode(), iv_bytes)
    return outer.decode()
def get_encSecKey():
    """Return the hard-coded value used for the ``encSecKey`` form field."""
    # NOTE(review): presumably this constant pairs with the fixed second AES
    # key used in get_params -- verify before changing either one.
    return "257348aecb5e556c066de214e531faadd1c55d814f9be95fd06d6bff9f4c7a41f831f6394d5a3fd2e3881736d94a02ca919d952872e7d0a50ebfa1769a7a62d512f5f1ca21aec60bc3819a9c3ffca5eca9a0dba6d6f7249b06f5965ecfff3695b54e1c28f3f624750ed39e7de08fc8493242e26dbc4484a01c76f739e135637c"
def AES_encrypt(text, key, iv):
    """Encrypt ``text`` with AES in CBC mode and return the result as base64.

    Args:
        text: plaintext, as ``str`` (UTF-8 encoded before encryption) or
            ``bytes``.
        key: AES key as bytes.
        iv: initialisation vector as bytes.

    Returns:
        The base64-encoded ciphertext as ``bytes``.
    """
    raw = text.encode() if isinstance(text, str) else bytes(text)
    # Pad on the encoded bytes, not on the character count: a non-ASCII
    # character encodes to several bytes, so padding computed from len(text)
    # would leave the byte length off the 16-byte block boundary.
    pad = 16 - len(raw) % 16
    raw += bytes([pad]) * pad
    encryptor = AES.new(key, AES.MODE_CBC, iv)
    return base64.b64encode(encryptor.encrypt(raw))
def get_json(url, data):
    """POST ``data`` to ``url`` with the module-level ``headers``.

    Returns the response's ``content`` attribute unparsed; callers decode and
    parse it themselves.
    """
    return requests.post(url, headers=headers, data=data).content
def crypt_api(id, offset):
    """Build the URL and encrypted form data for one page of song comments.

    Args:
        id: song id, interpolated into the request URL.
        offset: offset value placed into the request payload.

    Returns:
        A ``(url, data)`` tuple, where ``data`` is a dict holding the
        ``params`` and ``encSecKey`` form fields.
    """
    url = "http://music.163.com/weapi/v1/resource/comments/R_SO_4_%s/?csrf_token=" % id
    # Plain-text payload that is encrypted into the "params" field.
    payload = '{rid:"", offset:"%s", total:"true", limit:"20", csrf_token:""}' % offset
    aes_key = "0CoJUm6Qyw8W8jud"
    form = {
        "params": get_params(payload, aes_key),
        "encSecKey": get_encSecKey(),
    }
    return url, form
# Fetch the comments of song 516997458 at offset 0 through the encrypted
# /weapi/ endpoint and read the total comment count from the response.
# NOTE(review): `id` shadows the built-in function of the same name.
offset = 0
id = '516997458'
url, data = crypt_api(id, offset)
json_text = get_json(url, data)
json_dict = json.loads(json_text.decode("utf-8"))
comments_sum = json_dict['total']
comments_sum
8054
len(json_dict['comments'])
20
json_dict['comments'][0]
{'beReplied': [], 'commentId': 1112523641, 'content': '喜欢双笙,喜欢这首歌', 'isRemoveHotComment': False, 'liked': False, 'likedCount': 1, 'pendantData': None, 'time': 1525904882188, 'user': {'authStatus': 0, 'avatarUrl': 'http://p1.music.126.net/Eklu6D8QoR1Hb5UhLhCzPw==/109951163288324813.jpg', 'expertTags': None, 'experts': None, 'locationInfo': None, 'nickname': '狂妄嘻嘻', 'remarkName': None, 'userId': 1451756393, 'userType': 0, 'vipType': 0}}
json_dict['comments'][4]
{'beReplied': [{'content': '我们历史老师是一个年轻的小伙子。那是个阳光明媚的中午,他拖堂拖了很久,喇叭里响起了学校广播“校园之声”的开场白,接着就是这首歌。老师听到这首歌前奏后,自以为是地说一定是播音员自己唱的。我们都在下面反驳他,说人家歌就是这样的。。\n而现在,距中考只有58天了,毕业后,就回不去了。', 'status': 0, 'user': {'authStatus': 0, 'avatarUrl': 'http://p1.music.126.net/gm976KYbWTvYvExzjBNeaw==/109951163217371336.jpg', 'expertTags': None, 'experts': None, 'locationInfo': None, 'nickname': '惴洛', 'remarkName': None, 'userId': 1325932231, 'userType': 0, 'vipType': 0}}], 'commentId': 1112261542, 'content': '还有不到一个月了高一学姐祝你考试加油哦', 'isRemoveHotComment': False, 'liked': False, 'likedCount': 0, 'pendantData': None, 'time': 1525876023865, 'user': {'authStatus': 0, 'avatarUrl': 'http://p1.music.126.net/kAuCCkW-fcC7yu4wix9z5Q==/109951163144186242.jpg', 'expertTags': None, 'experts': None, 'locationInfo': None, 'nickname': '土园yy', 'remarkName': None, 'userId': 275653796, 'userType': 0, 'vipType': 0}}
# Same encrypted request, now with offset 20.
# NOTE(review): `id` shadows the built-in function of the same name.
offset = 20
id = '516997458'
url, data = crypt_api(id, offset)
json_text = get_json(url, data)
json_dict = json.loads(json_text.decode("utf-8"))
comments_sum = json_dict['total']
json_dict['comments'][0]
{'beReplied': [], 'commentId': 1107837178, 'content': '冥月声音好听好温柔[爱心]表白', 'isRemoveHotComment': False, 'liked': False, 'likedCount': 3, 'pendantData': None, 'time': 1525515089450, 'user': {'authStatus': 0, 'avatarUrl': 'http://p1.music.126.net/suhvzXk2pEUOaeHUPU0aQQ==/109951163173870029.jpg', 'expertTags': None, 'experts': None, 'locationInfo': None, 'nickname': '黴祇', 'remarkName': None, 'userId': 619018018, 'userType': 0, 'vipType': 0}}
# Same encrypted request, now with offset 40.
# NOTE(review): `id` shadows the built-in function of the same name.
offset = 40
id = '516997458'
url, data = crypt_api(id, offset)
json_text = get_json(url, data)
json_dict = json.loads(json_text.decode("utf-8"))
comments_sum = json_dict['total']
json_dict['comments'][0]
{'beReplied': [], 'commentId': 1102303635, 'content': '找这首歌找了好久了!!无厘头的找,今天无意居然听到了(*^▽^)/★*☆', 'isRemoveHotComment': False, 'liked': False, 'likedCount': 1, 'pendantData': None, 'time': 1525072647936, 'user': {'authStatus': 0, 'avatarUrl': 'http://p1.music.126.net/fU8tvMVN2f5WkSUZehQ21Q==/3274345636764863.jpg', 'expertTags': None, 'experts': None, 'locationInfo': None, 'nickname': '黎诺0', 'remarkName': None, 'userId': 129375977, 'userType': 0, 'vipType': 0}}
800/1018*460
361.49312377210214