Stock prices are buffeted by all sorts of rumors, and rumors are hard for a quant process to handle (short of monitoring every source on the web around the clock, which is unrealistic).
Retail investors are the biggest consumers of these rumors, so my idea is to build a crawler aimed at retail investors.
The idea: East Money's Guba forum has a 问董秘 ("Ask the Board Secretary") section, a platform where retail investors communicate with listed companies. Once a rumor about a stock starts spreading, retail investors often go there to ask the company to confirm or deny it.
On comparison, 问董秘 turns out to be an aggregation of 上证e互动 and 深交所互动易, but it has two problems: it updates slowly, and it only shows questions that have already been answered. By the time the board secretary has replied and the reply is posted there, the impact of the event (rumor) may already have been priced in.
So in the end I read the raw questions directly from 上证e互动 (http://sns.sseinfo.com/) and 深交所互动易 (http://irm.cninfo.com.cn/szse/index.html).
The program has two parts: the first uses Python to scrape the comments from the SSE and SZSE platforms; the second uses the jieba package for text analysis, filtering out the comments and high-frequency words of hot stocks.
Note: the program only filters hot-stock comments and high-frequency words; it does no part-of-speech analysis. It surfaces roughly 10-20 stocks and their hot comments per day, and the events behind those comments still have to be analyzed by hand.
Major events are rare. On most days, reviewing the filtered stocks and comments will come to nothing, but when a major rumor does surface, the program can act as a timely alert.
Part 1: Scraping
1. Source 1: 上证e互动 (SSE e-Interaction)
2. Source 2: 深交所互动易 (SZSE Easy IR)
3. Other notes: only CSI 800 constituents are scraped, and the constituent lists come from a local database. When copying the code, replace the stock pool with your own (the get_index800 step that loads the 800 constituents); a database-free substitute is sketched right after this list.
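For readers without the local Wind mirror used below, a minimal, hypothetical stand-in for get_index800 is to hard-code a watch list; the tickers here are placeholders, and the 'CODE.EXCHANGE' format matches what main() expects:

def get_index800(today):
    # hypothetical stand-in for the database-backed version in the script below;
    # replace these placeholder tickers with your own pool
    return ['600000.SH', '000001.SZ', '002642.SZ']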
# -*- coding:utf-8 -*-
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
from bs4 import BeautifulSoup
import bs4
import re
import datetime
import pandas as pd
import jieba.analyse
import MySQLdb as mdb
import requests

# shared headless Chrome instance for the SSE pages
chrome_options = Options()
chrome_options.add_argument('--headless')
driver = webdriver.Chrome(chrome_options=chrome_options)


def main():
    # 0.0 dates: find the previous trade day
    today = datetime.datetime.now().date()
    tradedays = get_tradecalendar(today)
    if today not in tradedays:
        lastday = tradedays[-1]
    else:
        lastday = tradedays[tradedays.index(today) - 1]

    # 1.0 load the CSI 800 constituents
    members_800 = get_index800(today)
    h = 0
    print 'crawler start'
    # 2.0 scrape SSE e-Interaction and SZSE Easy IR
    Failed_reading = []
    Data = pd.DataFrame([])
    for i in members_800:
        code = i[0:6]
        exchange = i[7:]
        h = h + 1
        print h, code
        if exchange == 'SH':
            try:
                x = get_ask_from_SHExchange(code, lastday, today)
                if len(x) == 0:
                    continue
                x['code'] = i
                Data = Data.append(x)
            except Exception:
                Failed_reading.append(i)
        elif exchange == 'SZ':
            x = get_ask_from_SZExchange(code, lastday, today)
            if len(x) == 0:
                continue
            x['code'] = i
            Data = Data.append(x)
        else:
            continue
    Data.index = range(len(Data.index))
    Data.to_csv('comments_' + today.strftime("%Y%m%d") + '.xls', index=False, encoding='utf-8_sig')
    print 'failed reading:', Failed_reading


def main1():
    # smoke test on a single SSE ticker
    today = datetime.datetime.now().date()
    tradedays = get_tradecalendar(today)
    if today not in tradedays:
        today = tradedays[-1]
    lastday = tradedays[tradedays.index(today) - 1]
    print get_ask_from_SHExchange('600000', lastday, today)


# read questions from SSE e-Interaction
def get_ask_from_SHExchange(code, yesterday, today):
    driver.get("http://sns.sseinfo.com")
    # search for the ticker and jump to the company page
    driver.find_element_by_id("com_search_txt").send_keys(code)
    driver.find_element_by_id("to_companyByCode").click()
    time.sleep(1)
    # open the "latest questions" (最新提问) tab
    driver.find_element_by_link_text(u'最新提问').click()
    time.sleep(1)
    # scroll to the bottom so lazily loaded questions render
    driver.execute_script("var q=document.documentElement.scrollTop=100000")
    time.sleep(3)
    soup = BeautifulSoup(driver.page_source, features='lxml')
    Tem = []
    for i in soup.find_all('div', attrs={'class': "m_feed_item m_question"}):
        soup_i = BeautifulSoup(str(i), features='lxml')
        question = soup_i.find('div', {'class': 'm_feed_txt'}).a.nextSibling.strip()
        webtime = soup_i.find('div', {'class': 'm_feed_from'}).span.string.strip()
        if u'昨天' in webtime:    # e.g. 昨天08:50
            t = (datetime.datetime.now() - datetime.timedelta(days=1)).date()
        elif u'前' in webtime:    # e.g. 8分钟前
            t = datetime.datetime.now().date()
        else:
            # e.g. 03月12日 09:20 -- the page omits the year, so assume the current
            # year; this is wrong across New Year, hence the date filter below
            t = webtime[0:6]
            t = datetime.datetime.strptime(u'%d年' % datetime.datetime.now().year + t,
                                           u"%Y年%m月%d日").date()
        Tem.append([t, question])
    Data = pd.DataFrame(Tem, columns=['t', 'ask'])
    Data = Data[(Data.t >= yesterday) & (Data.t <= today)].copy()
    Data.drop_duplicates(inplace=True)
    return Data


# SZSE Easy IR ==> read questions from the previous trade day through today
def get_ask_from_SZExchange(code, yesterday, today):
    url = ("http://irm.cninfo.com.cn/ircs/interaction/lastQuestionforSzseSsgs.do"
           "?condition.type=2&condition.stockcode=" + code + "&condition.stocktype=S")
    r = requests.get(url)
    soup = BeautifulSoup(r.text, features='lxml')
    x = []
    for i in soup.find_all(re.compile('p')):
        if (type(i.a) == bs4.element.Tag) & (type(i.span) == bs4.element.Tag):
            t = i.span.string.strip()[1:12]    # slice the 'YYYY年MM月DD日' date out of the span
            t = datetime.datetime.strptime(t, u"%Y年%m月%d日").date()
            comment = i.a.string.strip()
            x.append([t, comment])
    x = pd.DataFrame(x, columns=['t', 'ask'])
    x = x[(x.t >= yesterday) & (x.t <= today)].copy()
    x.drop_duplicates(inplace=True)
    return x


# downstream processing draft (superseded by part 2; kept for reference)
'''
questions = ""
for i in x.ask:
    questions = questions + i
print questions
for word in useless_words:
    questions = questions.replace(word, '')
questions = questions.strip()
print questions
for x in jieba.analyse.extract_tags(questions, topK=8, withWeight=True, allowPOS=()):
    print x[0], x[1]
'''


def get_index800(today):
    # union of the CSI 300 and CSI 500 constituents over the last 10 days
    start = (today - datetime.timedelta(days=10)).strftime("%Y%m%d")
    end = today.strftime("%Y%m%d")
    csi300 = get_indexmembers(start, end, '000300.SH')
    csi300 = sorted(set(csi300.ticker.values))
    csi500 = get_indexmembers(start, end, '000905.SH')
    csi500 = sorted(set(csi500.ticker.values))
    index800 = []
    index800.extend(csi300)
    index800.extend(csi500)
    return index800


def get_indexmembers(start, end, index_code):
    # index weights table in the local Wind mirror
    if index_code == '000300.SH': index_code = 'csi300'
    if index_code == '000905.SH': index_code = 'csi500'
    sql = ("select distinct ticker from Research.windIndexWgtsSSE"
           " where tradedate>='" + start + "' and tradedate<='" + end + "'"
           " and " + index_code + ">0;")
    cnn = mdb.connect('10.10.40.310', 'report', 'raP1_Hdr2', 'Wind')
    cnn = cnn.cursor(mdb.cursors.SSDictCursor)
    cnn.execute(sql)
    table = pd.DataFrame(list(cnn.fetchall()))
    return table


def get_tradecalendar(today):
    # trade days within the last 10 calendar days, from the local Wind mirror
    start = (today - datetime.timedelta(days=10)).strftime("%Y%m%d")
    end = today.strftime("%Y%m%d")
    sql = ("select distinct str_to_date(trade_days,'%Y%m%d') as trade_days"
           " from Wind.ASHARECALENDAR where trade_days>='" + start + "'"
           " and trade_days<='" + end + "' and s_info_exchmarket='SZSE'"
           " order by trade_days;")
    cnn = mdb.connect('10.10.40.310', 'report', 'raP1_Hdr2', 'Wind')
    cnn = cnn.cursor(mdb.cursors.SSDictCursor)
    cnn.execute(sql)
    table = pd.DataFrame(list(cnn.fetchall()))
    return sorted(set(table.trade_days.values))


if __name__ == "__main__":
    main()
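The trickiest part of the SSE scraper is the timestamp handling, so here is a sketch isolating that logic; parse_sse_time is a helper name I am introducing for illustration, not part of the script above. The page shows relative times (u'8分钟前', u'昨天08:50') or a month-day date with no year (u'03月12日 09:20'); assuming the current year is wrong for year-end posts read in January, which is why the caller still filters by date afterwards.

# -*- coding:utf-8 -*-
import datetime

def parse_sse_time(webtime, now=None):
    # hypothetical helper isolating the date logic of get_ask_from_SHExchange
    now = now or datetime.datetime.now()
    if u'昨天' in webtime:            # "yesterday HH:MM"
        return (now - datetime.timedelta(days=1)).date()
    if u'前' in webtime:              # "N minutes/hours ago"
        return now.date()
    # first 6 characters are 'MM月DD日'; prepend the current year (see caveat above)
    return datetime.datetime.strptime(u'%d年' % now.year + webtime[0:6],
                                      u'%Y年%m月%d日').date()

print parse_sse_time(u'昨天08:50')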
Part 2: Text processing
My goal is to catch hot events, so the approach is to pick securities that draw many comments and whose different comments share repeated words.
1. Selection criteria: at least 3 comments over the last two days, and a high-frequency word that appears at least 3 times; a minimal sketch of this rule follows the list.
2. Since some users post the same comment repeatedly, duplicates are dropped.
3. For a single user posting several similar comments, one could also scrape commenter nicknames and keep only the longest comment per user; given how slow the SZSE scraping already is, this was skipped and is left as a future improvement.
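Here is a simplified, hypothetical sketch of that selection rule (the real script below first picks keywords with jieba.analyse.extract_tags and then recounts them): deduplicate tokens within each comment, then count across comments. The asks variable and the sample comments are made up; the thresholds mirror the script's.

# -*- coding:utf-8 -*-
import jieba
from collections import Counter

# made-up comments for one stock, for illustration only
asks = [u'听说公司要重组,是真的吗', u'网传重组,请董秘求证', u'重组进展如何']

if len(asks) >= 3:                              # rule 1: at least 3 comments
    counter = Counter()
    for ask in asks:
        counter.update(set(jieba.cut(ask)))     # drop duplicate words within one comment
    # rule 2: keep multi-character words shared by at least 3 comments
    for w, c in counter.most_common():
        if c >= 3 and len(w) > 1:
            print w, c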
# -*- coding:utf-8 -*-
import pandas as pd
import jieba
import jieba.analyse
import MySQLdb as mdb
import datetime


def main():
    today = datetime.datetime.now().date()
    # stop words: greetings, particles, punctuation and boilerplate that carry no signal
    useless_words = [u'谢谢', u'上市', u'股价', u'有限公司', u'你好', u'的', u'董秘', u'公司',
                     u'管理层', u'请问', u'吗', u'我', u'您好', u'有没有', u'有何', u'方案',
                     u'贵司', u'贵公司', u'问下', u'信心', u'感谢', u'们', u'是否', u'多久',
                     u'应该', u'应当', u'建议', u'最好', u'为何', u'为什么', u'多少', u'!',
                     u',', u'。', u'?', u'、', u'…', u'……', u'...', u',', u'哪些', u'已',
                     u'贵', u'披露', u'公告', u'影响', u'股东', u'关心', u'客观']
    useless_words.extend([u'潜力', u'业务', u'投资者', u'是不是', u'如何', u'17', u'18', u'20'])

    data = pd.read_csv('comments_' + today.strftime("%Y%m%d") + '.xls', encoding='utf-8-sig')
    # count comments per stock and keep stocks with at least 3 comments
    grouped = data.groupby(['code']).size().reset_index(name='ask_num')
    data = pd.merge(data, grouped, on=['code'])
    data = data[data.ask_num >= 3].copy()
    codes = sorted(set(data.code.values))
    # attach security names
    Names = get_names_from_wind(codes)
    data = pd.merge(data, Names, on=['code'])
    data.sort_values(by=['ask_num'], ascending=False, inplace=True)
    # columns now: t, code, ask, ask_num, name

    Lists = []
    for code in codes:
        data_i = data[data.code == code].copy()
        n = data_i.ask_num.max()
        name_i = data_i.name.max()
        # tokenize each comment, dropping duplicate words within a single comment
        questions = ''
        for j in data_i.ask:
            questions = questions + ' '.join(set(jieba.cut(j)))
        # debug hook for a single ticker:
        # if code == '002642.SZ': print questions
        # remove meaningless words plus the stock's own short name; the name
        # tokens are appended to useless_words temporarily and popped afterwards
        name_i_ = list(jieba.cut(name_i, cut_all=True, HMM=True))  # a generator can only be iterated once
        for j in name_i_:
            useless_words.append(j)
        for word in useless_words:
            questions = questions.replace(word, '')
        for j in name_i_:
            useless_words.pop(useless_words.index(j))
        questions = questions.strip()  # only strips whitespace at both ends

        keywords = jieba.analyse.extract_tags(questions, topK=5, withWeight=True, allowPOS=())
        if len(keywords) < 3:  # too few distinct words left to score
            continue
        keyword1, keyword1_weight = keywords[0]
        keyword2, keyword2_weight = keywords[1]
        keyword3, keyword3_weight = keywords[2]
        # register the keywords with jieba: extract_tags can surface new terms
        # that cut, without registering them, would split apart
        keywords = [keyword1, keyword2, keyword3]
        for word in keywords:
            jieba.add_word(word, tag=None)
        # count how often keyword1/2/3 occur in the deduplicated corpus
        words_all = list(jieba.cut(questions, cut_all=True, HMM=True))
        appeared_times = [0, 0, 0]
        for i in range(len(keywords)):
            for word in words_all:
                if word == keywords[i]:
                    appeared_times[i] = appeared_times[i] + 1
        Lists.append([code, name_i, n,
                      keyword1, appeared_times[0], keyword1_weight,
                      keyword2, appeared_times[1], keyword2_weight,
                      keyword3, appeared_times[2], keyword3_weight])

    results = pd.DataFrame(Lists, columns=['code', 'name', 'num',
                                           'keyword1', 'n1', 'weight1',
                                           'keyword2', 'n2', 'weight2',
                                           'keyword3', 'n3', 'weight3'])
    # keep stocks where some keyword repeats at least 3 times across comments
    results = results[(results.n1 > 2) | (results.n2 > 2) | (results.n3 > 2)].copy()
    results.sort_values(by=['num'], ascending=False, inplace=True)
    results.index = range(len(results.index))
    print results

    results = results[['code', 'name', 'num', 'keyword1', 'n1', 'keyword2', 'n2', 'keyword3', 'n3']].copy()
    results.to_csv('keywords_' + today.strftime("%Y%m%d") + '.xls', index=False, encoding='utf-8_sig')

    # export the raw comments of the flagged stocks for manual review
    data = pd.read_csv('comments_' + today.strftime("%Y%m%d") + '.xls', encoding='utf-8-sig')
    tem = results[['code', 'name']].copy()
    data = pd.merge(data, tem, on=['code'])
    data.to_csv('comments_abnormal' + today.strftime("%Y%m%d") + '.xls', index=False, encoding='utf-8_sig')


# codes is a list of wind codes such as '600000.SH'
def get_names_from_wind(codes):
    codes = "('" + "','".join(codes) + "')"
    sql = ("select s_info_windcode as code, s_info_name as name"
           " from Wind.ASHAREDESCRIPTION where s_info_windcode in " + codes + ";")
    cnn = mdb.connect('10.10.40.310', 'report', 'raP1_Hdr2', 'Wind', charset='utf8')
    cnn = cnn.cursor(mdb.cursors.SSDictCursor)
    cnn.execute(sql)
    table = pd.DataFrame(list(cnn.fetchall()))
    return table


if __name__ == '__main__':
    main()
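For a sense of what the jieba.analyse.extract_tags call at the heart of the loop returns, here is a standalone example; the question text is invented, and the weights are TF-IDF scores whose exact values depend on jieba's bundled dictionary:

# -*- coding:utf-8 -*-
import jieba.analyse

# invented sample of a deduplicated question corpus for one stock
questions = u'重组 传闻 求证 重组 进展 停牌 重组 资产 注入'
# returns up to topK (word, TF-IDF weight) pairs, highest weight first
for word, weight in jieba.analyse.extract_tags(questions, topK=3,
                                               withWeight=True, allowPOS=()):
    print word, weight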