R語言之計算分子指紋及批量保存sdf格式

主要使用rcdk: 一個化學包

# One-time setup: install the released version of rcdk from the configured
# package repository.
install.packages("rcdk")

# Development releases of `cdkr` are also available on GitHub using devtools.
# rcdklibs is installed before rcdk (presumably because rcdk depends on it --
# TODO confirm); rcdk itself lives in the `rcdk` subdirectory of the cdkr repo.
library(devtools)
install_github("https://github.com/CDK-R/rcdklibs")
install_github("https://github.com/CDK-R/cdkr", subdir="rcdk")


#===========================================================================


#===========================================================================

# Session setup for the fingerprint / SDF export script.
# NOTE(review): rm(list = ls()) wipes every object in the calling environment
# and setwd() ties the script to one machine; both are kept as-is here because
# the rest of the script relies on relative paths.
rm(list=ls())
library(dplyr)   # distinct() and %>% used for de-duplication
library(tibble)  # NOTE(review): not used by the code visible in this script
library(tidyr)   # NOTE(review): not used by the code visible in this script
# NOTE(review): no random number generation is visible below; the seed looks
# like a leftover -- confirm before removing.
set.seed(1234)
library(ROCR)    # NOTE(review): not used by the code visible in this script
library(dplyr)   # duplicate of the library(dplyr) call above (harmless)
library(rcdk)    # parse.smiles(), get.fingerprint(), write.molecules()
# All relative paths below ('train.csv') resolve against this directory.
setwd('D:\\SCIwork\\F29\\lianxishuju')

# Read the raw training data. The CSV is expected to contain a SMILES column.
# TRUE/FALSE are spelled out below because T and F are ordinary variables
# that can be reassigned.
data <- read.csv('train.csv', header = TRUE)
#
# Example of the SMILES column:
# SMILES
# 1 [H][C@]12CCCN1CC1=C(C2)C2=C(C=C(OC)C(OC)=C2)C2=C1C=CC(OC)=C2
# 2         COC1=CC2=C(C=C1)C1=C(CC3CCCN3C1)C1=C2C=C(OC)C(OC)=C1
# 3         COC1=CC2=C(C=C1)C1=C(CN3CCCC3C1)C1=C2C=C(OC)C(OC)=C1
# 4       COC1=CC2=C(C=C1OC)C1=C(C=C(OC)C(OC)=C1)C1=C2CC2CCCN2C1
# 5          COC1=CC2=C(C=C1OC)C1=C(C=C(OC)C(OC)=C1)C(=C2)C(O)=O
# 6  COC1=CC2=C(C=C1OC)C1=C(C=C(OC)C(OC)=C1)C(=C2)C(=O)C1=CC=CN1

# Remove duplicated molecules: keep the first row for each distinct SMILES
# string, together with all of its other columns.
data <- data %>% distinct(SMILES, .keep_all = TRUE)
# NOTE: this overwrites the input file in place with the de-duplicated table.
write.csv(data, file = 'train.csv', row.names = FALSE)


# Read the (de-duplicated) data back in.
data <- read.csv('train.csv', header = TRUE)
# as.character() guards against R < 4.0, where read.csv() returns strings as
# factors; parse.smiles() expects a character vector.
SMILES <- as.character(data$SMILES)

# iter_num is the total number of SMILES. It is derived from the data instead
# of being hard-coded, so it can never drift from the contents of train.csv.
iter_num <- length(SMILES)
if (iter_num == 0) {
  stop("No SMILES found in train.csv", call. = FALSE)
}

# Fingerprint length in bits; one row of the result table per bit.
fp_size <- 1024

# Compute the 0/1 fingerprint vector (length fp_size) of the i-th SMILES.
fingerprint_bits <- function(i) {
  mol <- parse.smiles(SMILES[i])[[1]]
  # An unparseable SMILES comes back as NULL; fail with a clear message
  # instead of a cryptic error from inside get.fingerprint().
  if (is.null(mol)) {
    stop("Could not parse SMILES #", i, ": ", SMILES[i], call. = FALSE)
  }
  fp <- get.fingerprint(mol, type = 'standard', fp.mode = 'bit',
                        depth = 6, size = fp_size)
  bits <- numeric(fp_size)
  # fp@bits holds the positions of the bits that are set.
  bits[fp@bits] <- 1
  bits
}

# Build the whole fp_size x iter_num matrix in one pass rather than growing a
# data frame with cbind() inside a loop (which copies the table every time
# and mis-iterates as 2:1 when there is only a single molecule).
fp_matrix <- vapply(seq_len(iter_num), fingerprint_bits, numeric(fp_size))

# Result table: rows fpt1..fpt1024 (bits), columns molecule1..moleculeN.
dt <- as.data.frame(fp_matrix)
rownames(dt) <- paste0('fpt', seq_len(fp_size))
names(dt) <- paste0('molecule', seq_len(iter_num))

# Distribution of individual fingerprint bits: for every bit (row), count how
# many molecules have it set, store the count in a `sum` column, and tabulate
# how often each count occurs.
dt[['sum']] <- rowSums(dt)
table(dt[['sum']])

# Preview the top-left 6 x 6 corner of the table.
dt[seq_len(6), seq_len(6)]
# Example output:
# molecule1 molecule2 molecule3 molecule4 molecule5 molecule6
# fpt1         0         0         0         0         0         0
# fpt2         0         0         0         0         0         0
# fpt3         0         0         0         0         0         0
# fpt4         0         0         0         0         0         0
# fpt5         0         0         0         0         0         0
# fpt6         0         0         0         0         0         0

# The SDF files are written into the `ligand` sub-directory; the file names
# below are relative to this working directory.
setwd('D:\\SCIwork\\F29\\lianxishuju\\ligand')
# Write one SDF file per molecule: molecule1.sdf, molecule2.sdf, ...
# Iterating with seq_along(SMILES) covers exactly the molecules that were
# read, independent of any separately maintained count, and runs zero times
# when SMILES is empty.
for (i in seq_along(SMILES)) {
  # parse.smiles() returns a list of molecules, which write.molecules()
  # accepts directly.
  m <- parse.smiles(SMILES[i])
  file_name <- paste0('molecule', i, '.sdf')
  write.molecules(m, filename = file_name)
}





用 Discovery Studio 2019(DS2019)打開這些 sdf 文件後,將所有分子設爲可見(visible),然後另存爲 sdf 格式,即可將所有小分子合併保存到同一個 sdf 文件中。

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章