Mindfulness: Comparing Chinese and Western Interpretations
I haven't abandoned machine learning; I've just been busy with this.
Update 2:
Update:
The final report is here: [pdf-embedder url="https://www.yhzq-blog.cc/wp-content/uploads/2019/09/Common-and-Difference-of-Mindfulness-Interpretation-in-China-and-the-United-States.pdf" title="Common and Difference of Mindfulness Interpretation in China and the United States"]
It's written in English because it has to be shown to the professor...
The color schemes are randomized; try refreshing a few times.
Original post:
I'm taking a psychology course, mainly about mindfulness and ACT... The professor is really nice, so I did a small project to see whether there might be some bonus credit.
Basic idea: search Google for "mindfulness" and for "正念" separately, crawl every result page, and run a word-frequency analysis.
First, the Google search results: if you're physically outside the wall you can just use this directly, but unfortunately my school's V-P-N can't run in global mode... so I used a really dumb method: set Google to show 100 results per page, save the page, and pull the URLs out with a regex. The regex is re.findall(r"<a href=\"https*://www.(?!google|youtube).*?\" ping", s), which filters out Google's own services and YouTube links.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re

# Read the saved Google results page and extract the result URLs,
# skipping Google's own services and YouTube via the negative lookahead.
file1 = open("./mindfulness - Google Search.htm", "r")
s = file1.read()
file2 = open("./eng-go.txt", "a")
s1 = re.findall(r"<a href=\"https*://www.(?!google|youtube).*?\" ping", s)
for i in range(len(s1)):
    # keep just the URL and drop the trailing quote
    s1[i] = re.findall(r"http.*?\"", s1[i])
    s1[i] = s1[i][0][:-1]
print(len(s1))
for i in range(len(s1)):
    file2.write(s1[i] + '\n')
Then the problem was that Google returns too few Chinese pages, so the sample was too small; I did the same thing with Baidu. The regex is href=\"(.*?)target=\"_blank\"><em>
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re

# Same idea for the saved Baidu results page: pull the result URLs out of
# the <a ... target="_blank"><em> links.
file1 = open("./正念_百度搜索.htm", "r")
s = file1.read()
file2 = open("./chs2.txt", "a")
s1 = re.findall(r"href=\"(.*?)target=\"_blank\"><em>", s)
for i in range(len(s1)):
    # keep just the URL and drop the trailing quote
    s1[i] = re.findall(r"http.*?\"", s1[i])
    s1[i] = s1[i][0][:-1]
print(len(s1))
for i in range(len(s1)):
    file2.write(s1[i] + '\n')
Next is getting the page content, again with an off-the-shelf library (newspaper for the text extraction). For the download itself I'd still recommend requests, because some pages can't be fetched unless you fake the headers. The program looks roughly like this:
from newspaper import fulltext
import requests
import re
from termcolor import colored

i = 0
count = 0
with open('./eng-go.txt') as f:
    for line in f:
        i = i + 1
        fl = open("eng" + str(i) + ".txt", "w")
        url = line.strip()
        print(colored("Time:" + str(i), 'green'))
        print(colored("Trying to connect: ", 'green') + colored(url, 'blue'))
        # fake a browser User-Agent, otherwise some sites refuse the request
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"
        }
        try:
            res = requests.get(url, headers = headers, timeout = 5).text
            print(colored("Connect Succeed!", 'green'))
        except requests.exceptions.RequestException as e:
            print(colored("Connect Failed!", 'red'))
            continue
        try:
            # newspaper extracts the main article text from the raw HTML
            text = fulltext(res, 'en')
        except AttributeError:
            print(colored("Failed to convert!", 'red'))
            continue
        fl.write(text)
        count = count + 1
        print(colored("\n================Success!==================\n No." + str(count) + "\n", 'green'))
        fl.close()
I left it running in the library for a whole afternoon...
When it finished, the content was spread across a few hundred text files, so I first wrote a quick script to merge the documents into one, and then moved on to word segmentation.
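The merge script itself isn't worth showing; below is a minimal sketch of what it does. The filenames are assumptions: the scraper above writes eng1.txt, eng2.txt, ..., and engall.txt here is just a stand-in output name (the Chinese side was merged the same way into the chsall.txt that the segmentation script below reads).
# Minimal merge sketch (filenames assumed): concatenate the per-page text
# files produced by the scraper into a single corpus file.
import glob

with open("engall.txt", "w", encoding="utf-8") as out:
    for path in sorted(glob.glob("eng[0-9]*.txt")):
        with open(path, encoding="utf-8") as f:
            out.write(f.read() + "\n")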
English tokenization is simple; for Chinese I used jieba, and in the end I was too lazy to switch, so English went through jieba as well. This script uses pandas, which I found awkward and dropped later.
Segmentation needs a stopword list; those are easy to find, just search GitHub.
import numpy as np
import pandas as pd
import jieba
import jieba.analyse
import codecs

# keep only tokens that contain at least one Chinese character
def is_Chinese(word):
    for ch in word:
        if '\u4e00' <= ch <= '\u9fff':
            return True
    return False

stopwords = [line.strip() for line in codecs.open('stopwords.txt', 'r', 'utf-8').readlines()]
jieba.analyse.set_stop_words('stopwords.txt')
segments = []
s = open("chsall.txt", "r").read()
# get words
words = jieba.cut(s)
splitedStr = ''
for word in words:
    if word not in stopwords:
        if (is_Chinese(word) == False):
            continue
        segments.append({'word': word, "count": 1})
        splitedStr += word + ' '
# aggregate the per-token counts into one frequency per word
dfSg = pd.DataFrame(segments)
dfWord = dfSg.groupby('word')['count'].sum()
dfWord.to_csv('keywords.csv', encoding = 'utf-8')
Then came the disgusting part: translation.... The Chinese words have to be translated into English, and the English ones back into Chinese... each list has over 10,000 words. I'm pretty sure I got DKU's IP banned by at least five translation sites. Google offers an API that claims to be free and unlimited, but it blocked me in the end too. Baidu and iciba also blocked me. The only one left was Youdao Translate, which is comparatively humane: it seems to ban you for only a few hours and then recovers.
I needed a compromise. A decent option is a dictionary, but most dictionary databases dump a pile of senses for each word and are basically unusable. In the end I found a tiny site (which doesn't block crawlers!! Hooray!!) and scraped the first definition of every word. That code is hideously ugly (uglier than everything above), so I'm not posting it; a rough sketch of the idea is given after the next code block. Anyway, I scraped about 20,000 words and thought that would be enough, but a lot was still missing at translation time, so I combined local dictionary lookup with online translation, which keeps the load on Youdao's servers reasonable. I also made it learn: to guard against getting IP-banned again, every translation fetched online is saved locally. The super-ugly code is below (just take the idea from it; there's a lot of meaningless leftover stuff I never got around to deleting):
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
from nltk.stem.snowball import SnowballStemmer
from googletrans import Translator
from translation import iciba
from PyDictionary import PyDictionary
import urllib.request
import urllib.parse
import json
import csv
from termcolor import colored

# Youdao web translation endpoint
def youdao_translate(content):
    youdao_url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule'
    data = {}
    data['i'] = content
    data['from'] = 'AUTO'
    data['to'] = 'AUTO'
    data['smartresult'] = 'dict'
    data['client'] = 'fanyideskweb'
    data['salt'] = '1525141473246'
    data['sign'] = '47ee728a4465ef98ac06510bf67f3023'
    data['doctype'] = 'json'
    data['version'] = '2.1'
    data['keyfrom'] = 'fanyi.web'
    data['action'] = 'FY_BY_CLICKBUTTION'
    data['typoResult'] = 'false'
    data = urllib.parse.urlencode(data).encode('utf-8')
    youdao_response = urllib.request.urlopen(youdao_url, data)
    youdao_html = youdao_response.read().decode('utf-8')
    target = json.loads(youdao_html)
    trans = target['translateResult']
    ret = ''
    for i in range(len(trans)):
        line = ''
        for j in range(len(trans[i])):
            line = trans[i][j]['tgt']
        ret += line
    return ret

def judge_pure_english(keyword):
    for i in range(len(keyword)):
        if keyword[i] > 'z' or keyword[i] < 'a':
            return False
    return True

def is_Chinese(word):
    for ch in word:
        if '\u4e00' <= ch <= '\u9fff':
            return True
    return False

tr = Translator()

# load the local dictionaries: the scraped one (Dic.csv) plus whatever has
# been learned from previous online lookups (Newdic.csv)
dicts = {}
with open("Dic.csv") as l:
    reader = csv.DictReader(l)
    for row in reader:
        dicts[row['Chinese']] = row['English']
with open("Newdic.csv", "r") as g:
    reader = csv.DictReader(g)
    for row in reader:
        if not row["Chs"] in dicts:
            dicts[row["Chs"]] = row["Eng"]
print(dicts['测试'])

# first pass: translate whatever the local dictionary covers, mark the rest NA
def translate_with_dictionary():
    f1 = open("chs.csv", "w")
    f1.write("Word,Count,Trans\n")
    dictionary = PyDictionary()
    with open("chs-2.csv") as f:
        for line in f:
            s = re.findall(r".*?,", line)[0][:-1]
            try:
                s1 = dicts[s]
            except KeyError:
                s1 = 'NA'
            # print(s1)
            if s1 == None:
                continue
            # try:
            #     s1 = tr.translate(s, dest='zh-CN').text
            # except json.decoder.JSONDecodeError:
            #     continue
            # s1 = youdao_translate(s)
            line = line[:-1]
            f1.write(line + ',' + s1 + '\n')
            # print(line + ',' + s1 + '\n')

# second pass: send the words that are still NA to the online translator and
# remember every answer in Newdic.csv so a ban doesn't lose the work
def translate_with_google():
    f1 = open("chs-final2.csv", "w")
    f1.write("Word,Count,Trans\n")
    dictionary = PyDictionary()
    flag = 0
    with open("chs-final.csv") as f:
        for line in f:
            s = re.findall(r".*?,", line)[0][:-1]
            if (line[len(line) - 2] != 'A' or flag or s in dicts):
                f1.write(line)
                continue
            try:
                # s1 = youdao_translate(s).lower()
                s1 = tr.translate(s, dest='zh-CN').text.lower()
            except json.decoder.JSONDecodeError:
                # the API started refusing us: stop translating, keep copying
                flag = 1
                f1.write(line)
                continue
            print(s1)
            if s1.find(" ") >= 0:
                f1.write(line)
                continue
            if (is_Chinese(s1)):
                f1.write(line)
                dicts[s] = s1
                continue
            print(colored("Recorded!", "green"))
            dicts[s] = s1
            line = line[:-4]
            f1.write(line + ',' + s1 + '\n')
    # dump everything learned so far back into the local dictionary
    f3 = open("Newdic.csv", "w")
    f3.write("Chs,Eng\n")
    for key in dicts:
        f3.write(key + ',' + dicts[key] + '\n')

translate_with_google()
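As promised above, here is a rough sketch of the dictionary-scraping idea. The site I actually used isn't named here, so the URL and the regex below are made-up placeholders rather than a real API; the real script just fetched each word's page, kept the first definition, and appended the result to the local CSV (Dic.csv) that the translation script loads.
# Sketch only: fetch a word's page from some dictionary site and keep the
# first definition. Both the URL pattern and the regex are placeholders.
import re
import requests

def first_definition(word):
    url = "https://example-dictionary.com/word/" + word   # placeholder URL
    html = requests.get(url, timeout = 5).text
    hits = re.findall(r"<li class=\"definition\">(.*?)</li>", html)   # placeholder pattern
    return hits[0] if hits else None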
After translating there were still some problems: English inflection (tense, plurals) causes some weird duplicates, and the translated lists contain overlapping entries that need to be merged.
The inflection problem can be handled by lemmatizing first, using NLTK's WordNet lemmatizer:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
stem = lemmatizer.lemmatize(word)
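One thing the snippet doesn't show: WordNetLemmatizer needs the WordNet data downloaded once beforehand.
import nltk
nltk.download('wordnet')  # one-time download used by WordNetLemmatizer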
So I record a translation for each lemmatized word; when two rows collapse onto the same lemma but their translations disagree, I ask Youdao for a fresh one. The Flag column marks whether an entry has been re-translated that way:
import csv
from nltk.stem import WordNetLemmatizer
import urllib.request
import urllib.parse
import json

# same Youdao translation helper as above
def youdao_translate(content):
    youdao_url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule'
    data = {}
    data['i'] = content
    data['from'] = 'AUTO'
    data['to'] = 'AUTO'
    data['smartresult'] = 'dict'
    data['client'] = 'fanyideskweb'
    data['salt'] = '1525141473246'
    data['sign'] = '47ee728a4465ef98ac06510bf67f3023'
    data['doctype'] = 'json'
    data['version'] = '2.1'
    data['keyfrom'] = 'fanyi.web'
    data['action'] = 'FY_BY_CLICKBUTTION'
    data['typoResult'] = 'false'
    data = urllib.parse.urlencode(data).encode('utf-8')
    youdao_response = urllib.request.urlopen(youdao_url, data)
    youdao_html = youdao_response.read().decode('utf-8')
    target = json.loads(youdao_html)
    trans = target['translateResult']
    ret = ''
    for i in range(len(trans)):
        line = ''
        for j in range(len(trans[i])):
            line = trans[i][j]['tgt']
        ret += line
    return ret

lemmatizer = WordNetLemmatizer()
f1 = open("eng-merged.csv", "w")
dic = {}
timeout = 0
with open("eng-final.csv") as f:
    reader = csv.DictReader(f)
    for row in reader:
        s_e = row['Word']
        count = int(row['Count'])
        s_c = row['Trans']
        # merge rows that share the same lemma
        s_e = lemmatizer.lemmatize(s_e)
        if not s_e in dic:
            dic[s_e] = {'Count': count, "Trans": s_c, "Flag": 0}
        else:
            if dic[s_e]["Trans"] == s_c or dic[s_e]["Flag"] or timeout:
                dic[s_e]['Count'] += count
            else:
                # translations disagree: ask Youdao once for the lemma itself
                try:
                    new = youdao_translate(s_e)
                    print(new)
                except json.decoder.JSONDecodeError:
                    timeout = 1
                if timeout == 1:
                    dic[s_e]['Count'] += count
                else:
                    dic[s_e]["Flag"] = 1
                    dic[s_e]["Trans"] = new
                    dic[s_e]['Count'] += count
f1.write("Eng,Chs,Count,Flag\n")
for key in dic:
    f1.write(key + ',' + dic[key]["Trans"] + ',' + str(dic[key]["Count"]) + ',' + str(dic[key]["Flag"]) + '\n')
The word clouds were drawn with R's wordcloud2 package; Python would be just as easy, but since I'm studying statistics anyway...
First, database needs two columns, word and freq, and the code is
library(wordcloud2); library(htmlwidgets)  # saveWidget comes from htmlwidgets
saveWidget(wordcloud2(database), file = "path", selfcontained = FALSE)
The path has to point to a specific file, e.g. /Volumes/data/!!homework/test.html.
You can add a mask, change the colors and background, and so on; just check ?wordcloud2.
See... it's pretty simple.
One last thing: life is short, I use Python.
Python really is a joy....