A Roundup of Handy Python Scripts for SEO Work
Life is short, I use Python. Python is like the runes or gear in Honor of Kings: a strong weapon helps you clear jungle camps more easily and level up faster, so you can find a breakthrough on the strength of a level advantage.
Use case 1: filtering out sensitive, irrelevant, and other junk keywords
# -*- coding: utf-8 -*-
op_txt = open('done.txt', 'a', encoding='utf-8')

class NaiveFilter():
    def __init__(self):
        self.keywords = set()

    def parse(self, path):
        # load the banned words, one per line
        for keyword in open(path, encoding='utf-8'):
            self.keywords.add(keyword.strip().lower())

    def filter(self, message, replss='*'):
        message = message.lower()
        for k in self.keywords:
            if k in message:
                message = message.replace(k, replss)
        # write the line once, after every keyword has been checked
        op_txt.write('%s\n' % message)
        print(message)

if __name__ == '__main__':
    f = NaiveFilter()
    f.parse('keywords')  # "keywords" holds the sensitive/unwanted words
    for line in open('hotword.txt', encoding='utf-8'):  # hotword.txt is the word list to filter
        f.filter(line.strip())
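If the banned-word list grows large, checking every keyword against every line gets slow. A minimal alternative sketch (assuming the same keywords and hotword.txt files, and a non-empty keyword list) compiles all the banned words into one regex so each line is scanned only once:

import re

# build a single alternation pattern out of all banned words
keywords = [k.strip().lower() for k in open('keywords', encoding='utf-8') if k.strip()]
pattern = re.compile('|'.join(re.escape(k) for k in keywords))

with open('hotword.txt', encoding='utf-8') as src, open('done.txt', 'a', encoding='utf-8') as dst:
    for line in src:
        dst.write(pattern.sub('*', line.strip().lower()) + '\n')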
Use case 2: computing high-frequency words and TF-IDF with jieba segmentation
# coding=utf-8
import jieba
import jieba.analyse  # jieba's TF-IDF keyword-extraction module

output = open('words.csv', 'w', encoding='utf-8')
output.write('word,frequency,weight\n')
# load the stopword file into a list
stopkeyword = [line.strip() for line in open('stop.txt', encoding='utf-8')]
# read the text to be analysed
text = open('new.txt', encoding='utf-8').read()

# count term frequencies with search-engine-style segmentation
zidian = {}
for fc in jieba.cut_for_search(text):
    zidian[fc] = zidian.get(fc, 0) + 1

# compute TF-IDF weights for the top 30 terms
tfidf = jieba.analyse.extract_tags(text, topK=30, withWeight=True)

# write every non-stopword term to the CSV
for word, weight in tfidf:
    if word in stopkeyword:
        continue
    freq = zidian.get(word, 'not found')
    percent = str(int(weight * 100)) + '%'
    print(word, freq, percent)
    output.write('%s,%s,%s\n' % (word, freq, percent))
output.close()
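jieba can also take the stopword file directly via jieba.analyse.set_stop_words, which lets extract_tags drop stopwords itself instead of filtering afterwards. A short sketch assuming the same stop.txt and new.txt:

import jieba.analyse

jieba.analyse.set_stop_words('stop.txt')  # stopwords are now filtered inside extract_tags
text = open('new.txt', encoding='utf-8').read()
for word, weight in jieba.analyse.extract_tags(text, topK=30, withWeight=True):
    print(word, weight)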
Use case 3: targeted, scheduled collection of updates
# coding:utf-8
import re
import time
import requests
from bs4 import BeautifulSoup  # the "lxml" parser below also needs the lxml package installed

str_time = time.strftime('%Y-%m-%d', time.localtime())
url = 'http://www.xxx.com/sitemap/group.htm'
html = requests.get(url).content
soup = BeautifulSoup(html, 'lxml')

# load the URLs collected on previous runs so nothing is recorded twice
seen = set()
with open('url.txt') as f:
    for line in f:
        seen.add(line.strip())

op_txt = open('url.txt', 'a')
for url_data, title in re.findall(r'<li>.*?href="(.*?)" target="_blank">(.*?)</a></li>', str(soup)):
    url_data = url_data.strip()
    if '2019' in title:
        print(title, url_data)
        if url_data in seen:
            print('no update ' + str_time)
        else:
            print('updated ' + str_time)
            op_txt.write('%s\n' % url_data)
op_txt.close()
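The heading promises scheduled collection, but the script itself only runs once; the timing has to come from outside. A minimal sketch, assuming the code above is saved as collect.py (a hypothetical filename):

import subprocess
import time

# re-run the collector once an hour; a crontab entry such as
#   0 * * * * python collect.py
# does the same job more robustly (collect.py is a hypothetical name)
while True:
    subprocess.run(['python', 'collect.py'])
    time.sleep(3600)  # wait one hour between runs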
Use case 4: generating sitemap files for millions of URLs in one go
import time

date = time.strftime('%Y-%m-%d', time.localtime())
urls = [url.strip() for url in open('url.txt')]

class Sitemaps:
    def __init__(self):
        self.n = 1

    def name(self, c):
        # start a new sitemap file and write the XML header
        opxml = open('sitemap%s.xml' % c, 'w', encoding='utf-8')
        opxml.write('<?xml version="1.0" encoding="utf-8"?>\n<urlset>\n')
        return opxml

    def zhizuo(self, urllist):
        xmldata = self.name(self.n)
        m = 0
        for i in urllist:
            m += 1
            xmldata.write('  <url>\n'
                          '    <loc>%s</loc>\n'
                          '    <lastmod>%s</lastmod>\n'
                          '    <priority>0.8</priority>\n'
                          '  </url>\n' % (i.strip(), date))
            # the sitemap protocol caps one file at 50,000 URLs, so roll over to a new file
            if m == 50000:
                xmldata.write('</urlset>\n')
                xmldata.close()
                self.n += 1
                xmldata = self.name(self.n)
                m = 0
        xmldata.write('</urlset>\n')
        xmldata.close()

if __name__ == '__main__':
    p = Sitemaps()
    p.zhizuo(urls)
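Once the URLs are split across many sitemapN.xml files, search engines expect a sitemap index pointing at all of them. A minimal sketch, assuming the files produced above are served from http://www.example.com/ (a placeholder domain):

import glob
import time

date = time.strftime('%Y-%m-%d', time.localtime())
with open('sitemap_index.xml', 'w', encoding='utf-8') as idx:
    # the sitemap protocol also expects an xmlns attribute on <sitemapindex>,
    # omitted here to match the bare <urlset> style used above
    idx.write('<?xml version="1.0" encoding="utf-8"?>\n<sitemapindex>\n')
    for name in sorted(glob.glob('sitemap*.xml')):
        if name == 'sitemap_index.xml':
            continue  # do not index the index file itself
        idx.write('  <sitemap>\n'
                  '    <loc>http://www.example.com/%s</loc>\n'
                  '    <lastmod>%s</lastmod>\n'
                  '  </sitemap>\n' % (name, date))
    idx.write('</sitemapindex>\n')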
Use case 5: merging all log files in a directory
# coding=utf-8
import os
import sys
import glob

def dirTxtToLargeTxt(dir, outputFileName):
    '''Read every .txt file under dir and append them all to outputFileName.'''
    # refuse to run if dir is not actually a directory
    if not os.path.isdir(dir):
        print('Bad argument: %s is not a directory' % dir)
        return False
    # list all txt files in dir and copy them over byte for byte
    with open(outputFileName, 'ab') as outputFile:
        for txtFile in glob.glob(os.path.join(dir, '*.txt')):
            print(txtFile)
            with open(txtFile, 'rb') as inputFile:
                for line in inputFile:
                    outputFile.write(line)
    return True

if __name__ == '__main__':
    if len(sys.argv) < 3:
        print('Usage: %s dir outputFileName' % sys.argv[0])
        sys.exit()
    dirTxtToLargeTxt(sys.argv[1], sys.argv[2])
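A quick usage sketch (the script filename, directory, and output file here are all hypothetical):

# from the command line, assuming the script is saved as merge_txt.py:
#   python merge_txt.py ./logs merged.txt
# or called directly from Python:
dirTxtToLargeTxt('./logs', 'merged.txt')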
Use case 6: batch-inserting articles into an EmpireCMS database
# encoding=utf-8
import time
import MySQLdb

a = '杨百辉博客'              # article title
b = 'http://bigwayseo.com/'  # article body

title_list = [int(time.time()), a]  # newstime, title
contnet_list = [b]                  # newstext

try:
    conn = MySQLdb.connect(host='localhost', user='root', passwd='root',
                           db='empirecms', port=3306, charset='utf8')
    cur = conn.cursor()
    # insert the main record, then use MySQL's @@IDENTITY (the last insert id)
    # to fill in the derived filename/URL fields and the two related tables
    cur.execute("""INSERT INTO `phome_ecms_news` (`classid`, `userid`, `username`, `newstime`, `havehtml`, `title`, `smalltext`) VALUES ('2', '1', 'admin', %s, '1', %s, '')""", title_list)
    cur.execute("""UPDATE `phome_ecms_news` SET filename = @@IDENTITY WHERE id = @@IDENTITY""")
    cur.execute("""UPDATE `phome_ecms_news` SET titleurl = concat('/sanwen/', @@IDENTITY, '.html') WHERE id = @@IDENTITY""")
    cur.execute("""INSERT INTO `phome_ecms_news_data_1` (`id`, `classid`, `dokey`, `newstext`) VALUES (@@IDENTITY, '2', '1', %s)""", contnet_list)
    cur.execute("""INSERT INTO `phome_ecms_news_index` (`id`, `classid`, `checked`, `havehtml`) VALUES (@@IDENTITY, '2', '1', '1')""")
    conn.commit()
    cur.close()
    conn.close()
except MySQLdb.Error as e:
    print('MySQL Error %d: %s' % (e.args[0], e.args[1]))
Used in reasonable moderation, Python helps boost productivity and lifts the mood. As the saying goes: a little scripting delights; too much costs you your hair~
That's all.