-
Notifications
You must be signed in to change notification settings - Fork 0
/
pubmedText.R
51 lines (44 loc) · 2.25 KB
/
pubmedText.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
library(ff)
pubmedText <- function(url) {
  # Read a PubMed efetch XML document and extract, for each <PubmedArticle>
  # element, the PMID, title, abstract, journal abbreviation and publication
  # date.
  #
  # Args:
  #   url: address of the efetch result to read, e.g.
  #     http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=24961184,24961183,24961148,24960992,24960298&retmode=xml&rettype=abstract
  #
  # Returns:
  #   An ffdf (ff package) with the columns pmid, title, abstract, journal
  #   and year. Presumably one row per article; this depends on
  #   regexpResult() returning one value per input element (TODO confirm).

  # gregexpResult() and regexpResult() are project helpers, presumably
  # defined in regexp.R (TODO confirm). Load that file only when the helpers
  # are not already available, so the function also works where they were
  # loaded some other way and this machine-specific path does not exist.
  helpers_loaded <- exists("gregexpResult", mode = "function") &&
    exists("regexpResult", mode = "function")
  if (!helpers_loaded) {
    source("D:/Program Files/RStudio/RFile/textMining/regexp.R")
  }

  # Read the page content. Without an explicit encoding, later substring()
  # calls fail with a multibyte read error.
  text <- readLines(url, encoding = "UTF-8")
  # Join all lines into one string so the regular expressions can match
  # across the original line breaks.
  text <- paste(text, collapse = "")

  # Split the document into articles; every field is then extracted per
  # article.
  articleStr <- "<PubmedArticle>.*?</PubmedArticle>"
  article <- gregexpResult(articleStr, text)

  # PMID
  pmidStr <- "(?<=<PMID Version=\"1\">)[0-9]*?(?=</PMID>)"
  pmid <- regexpResult(pmidStr, article)

  # Article title
  titleStr <- "(?<=<ArticleTitle>).*?(?=</ArticleTitle>)"
  title <- regexpResult(titleStr, article)

  # Abstract: drop the <AbstractText ...> wrappers, then collapse runs of
  # whitespace into a single space.
  abstractStr <- "(?<=<Abstract>).*?(?=</Abstract>)"
  abstract <- regexpResult(abstractStr, article)
  abstract <- gsub(
    pattern = "(<AbstractText.*?>)*(</AbstractText>)*",
    replacement = "", x = abstract, perl = TRUE
  )
  abstract <- gsub(
    pattern = "\\s+", replacement = " ", x = abstract, perl = TRUE
  )

  # Journal (ISO abbreviation)
  journalStr <- "(?<=<ISOAbbreviation>).*?(?=</ISOAbbreviation>)"
  journal <- regexpResult(journalStr, article)

  # Publication date: take the content of <PubDate>, then strip the
  # Year/Month/MedlineDate tags, the whole <Day> element and all whitespace,
  # leaving e.g. "2014Jun". Done in one pass with an alternation so that
  # closing tags are removed whether or not whitespace follows them.
  yearStr <- "(?<=<PubDate>).*?(?=</PubDate>)"
  year <- regexpResult(yearStr, article)
  year <- gsub(
    pattern = "<Day>.*?</Day>|</?(Year|Month|MedlineDate)>|\\s+",
    replacement = "", x = year, perl = TRUE
  )

  # ff cannot store character vectors, so the text columns must be factors.
  # That was the data.frame() default before R 4.0; request it explicitly so
  # as.ffdf() keeps working on current R.
  pubmed <- data.frame(
    pmid, title, abstract, journal, year,
    stringsAsFactors = TRUE
  )

  # Convert to the ffdf format (ff package) and return it.
  as.ffdf(pubmed)
}