2018-03-19 Python code for scraping all of HDU's courses

One day 毛钊岚 =。= asked me to scrape all of HDU's courses, so after a day of planning and a day of digging through docs I got it done =。= hehehe

Scraping in progress =。=

All I needed were the selenium and re modules, which was also a good chance to exercise my regex skills.

While scraping I found that plain requests wouldn't do: the HDU site loads its data through JavaScript-driven requests, so a single requests call can't fetch everything. After looking up selenium online I wrote this =。=
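You can actually see why one plain request isn't enough: the course grid is an ASP.NET WebForms page (it's an .aspx URL), and paging happens through hidden postback fields rather than normal links. A minimal check, just a sketch assuming the page is reachable:

import requests
from bs4 import BeautifulSoup

# Fetch the page once and look for the WebForms postback fields;
# if __VIEWSTATE is present, every "next page" needs a POST that
# replays these hidden values rather than a simple GET.
r = requests.get('http://jxgl.hdu.edu.cn/jxrwcx.aspx')
soup = BeautifulSoup(r.content, 'html5lib')
print(soup.find(id='__VIEWSTATE') is not None)        # expected: True
print(soup.find(id='__EVENTVALIDATION') is not None)  # expected: True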

Here's the code =。= One more thing: chromedriver must match your installed Chrome version, or it simply won't run.
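You can sanity-check that match first; a tiny sketch (assumes chromedriver is on your PATH or in the current folder), comparing the major version it prints against chrome://version in the browser:

import subprocess

# Print the ChromeDriver version; its major version must match the
# Chrome major version shown at chrome://version.
print(subprocess.check_output(['chromedriver', '--version']).decode())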


import re
import time
from selenium import webdriver

url = 'http://jxgl.hdu.edu.cn/jxrwcx.aspx'
file = open('kechengmingchen.txt', 'w', encoding='utf-8')

def openwindow():
    # chromedriver must match the installed Chrome version
    driver = webdriver.Chrome(executable_path='C:\\Users\\assu\\PycharmProjects\\CTF\\学校课程\\chromedriver.exe')
    # driver.maximize_window()
    driver.get(url)
    for i in range(1, 50000):
        yemian, yeshu = tiqu(driver.page_source)  # course rows, current page number
        print("Page {}".format(yeshu[0]))
        # print(len(yemian), yemian, sep='\n')
        for j in range(1, len(yemian)):        # skip the header row
            file.write(yemian[j][1] + '\n')    # cell 1 is the course name
        time.sleep(1)
        # The pager row shows ten page links plus a trailing "..." link;
        # after the first ten pages the link indices wrap inside the window.
        if i < 11:
            xpath = '//*[@id="DBGrid"]/tbody/tr[16]/td/a[{}]'.format(i)
        elif i % 10 == 0:
            xpath = '//*[@id="DBGrid"]/tbody/tr[16]/td/a[{}]'.format(11)
        else:
            xpath = '//*[@id="DBGrid"]/tbody/tr[16]/td/a[{}]'.format(i % 10 + 1)
        try:
            select = driver.find_element_by_xpath(xpath)
            print("Clicking page " + select.text)
            select.click()
        except Exception:
            if int(yeshu[0]) == i + 1:  # compare the page number itself, not the list
                continue
            print("The last page is page {}".format(yeshu[0]))
            break

def tiqu(text):
    # Each course row in the DBGrid table is nine consecutive <td> cells.
    regex = '<td>(.*?)</td>' * 9
    all_re = re.findall(regex, text)
    # The current page number is rendered as a <span> in the pager row.
    reg = '<span>(.*?)</span>'
    arr_1 = re.findall(reg, text)
    return all_re, arr_1

openwindow()
file.close()
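To see what tiqu() actually captures, here is the same nine-cell pattern run against a fabricated one-row fragment (every cell value below is invented for illustration):

import re

# A made-up row in the same shape as the DBGrid course rows.
sample = ('<td>已开课</td><td>高等数学</td><td>4</td><td>考试</td><td>必修</td>'
          '<td>张三</td><td>A0001</td><td>1-16</td><td>周一1-2节</td>')
rows = re.findall('<td>(.*?)</td>' * 9, sample)
print(rows[0][1])  # -> 高等数学, the course-name cell the script saves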

The whole script is only about 44 lines =。= pretty simple, right?

Here's what the result looks like =。=

Of course you can grab more fields if you want; look at the code and you'll see how =。= (there's a sketch right below).
HDU's anti-scraping is honestly weak =。=
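For example, since each yemian entry is already a 9-tuple of every cell the regex captured, writing the whole row instead of just the course name is a one-line change (a sketch against the loop above):

# inside openwindow(), write all nine captured cells per row
for j in range(1, len(yemian)):
    file.write(','.join(yemian[j]) + '\n')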

Later I chatted with o爷爷 about scraping all of HDU's courses.

He gave me a copy of his code too; it skips selenium entirely and runs much faster than mine.

I'd recommend his version, hehehe

import time

import requests
from bs4 import BeautifulSoup

url = 'http://jxgl.hdu.edu.cn/jxrwcx.aspx'

session = requests.Session()
r = session.get(url)
soup = BeautifulSoup(r.content, "html5lib")

semester = input('Which semester (1/2)? ')

if semester == '2':
    # The page defaults to semester 1, so replay one postback to switch.
    r = session.post(
        url,
        data={
            'ddlXY': None, 'ddlJS': None, 'kcmc': None, 'ddlXN': '2017-2018',
            'ddlXQ': semester, 'DropDownList1': 'kcmc', 'TextBox1': None,
            '__LASTFOCUS': None, '__EVENTARGUMENT': None,
            '__EVENTTARGET': 'DBGrid$ctl18$ctl01',
            '__VIEWSTATE': soup.find(id='__VIEWSTATE')['value'],
            '__EVENTVALIDATION': soup.find(id='__EVENTVALIDATION')['value'],
        })
    soup = BeautifulSoup(r.content, "html5lib")

f = open('semester%s.csv' % semester, 'w', encoding='utf-8')
f.write('''"开课状态","课程名称","学分","考核方式","课程性质","任课教师",\
"选课课号","起止周","上课时间","上课地点","开课学院","合班信息"\n''')

first_ten_page = True
first_page = True
page = 1 if semester == '1' else 0
total = 1 if semester == '1' else 0

try:
    while True:
        # Drop the header row and the pager row of the DBGrid table.
        trs = soup.find('table', id='DBGrid').find_all('tr')[1:-1]
        for tr in trs:
            tds = list(tr.children)[1:-1]
            if first_page and semester == '2':
                # The first response is still semester 1, so skip it.
                print('Changing to semester 2...')
                break
            else:
                f.write('"' + '","'.join([i.text for i in tds]) + '"\n')
                print('[%3d] Got: %s' % (total, tds[1].text))
                total += 1

        # Fetch the next page: each pager link is a WebForms postback
        # target named DBGrid$ctl18$ctlNN.
        r = session.post(
            url,
            data={
                'ddlXY': None, 'ddlJS': None, 'kcmc': None, 'ddlXN': '2017-2018',
                'ddlXQ': semester, 'DropDownList1': 'kcmc', 'TextBox1': None,
                '__LASTFOCUS': None, '__EVENTARGUMENT': None,
                '__EVENTTARGET': 'DBGrid$ctl18$ctl%02d' % page,
                '__VIEWSTATE': soup.find(id='__VIEWSTATE')['value'],
                '__EVENTVALIDATION': soup.find(id='__EVENTVALIDATION')['value'],
            },
        )
        soup = BeautifulSoup(r.content, "html5lib")
        time.sleep(1)
        # After the first ten pages the pager window shifts, so the
        # control index wraps back to 2.
        if (page == 10 and first_ten_page) or page == 11:
            page = 2
            first_ten_page = False
        else:
            page += 1
        first_page = False
        f.flush()

except KeyboardInterrupt:
    f.close()
    print("exit now")
