We were curious how prevalent viruses are in Anaerobic Digesters. Here is my non-rigourous attempt at getting some answers.
I used MG-RAST webserver to select a subset of metagenomes. I searched for any terms containing “anaerobic digestion” and with “shotgun” in seq_type
. this yielded 212 metagenomes. I downloaded the metadata, and manually created a file called annotations.txt
that included just the mg-rast ID (which looks kinda like a checksum).
From there, I downloaded the summary statistics for the taxonomy using the (very fragile) mg-display-statistics.py
script.
mkdir display_output
while read x;
do
# its written in python2. Its 2018, and its written in python2.
`which python2` `which mg-display-statistics.py` --id $x --stat domain > display_output/${x}.txt ;
done < accessions.txt
Nest, I read in the files, made a combined table from all of them:
dir <- "./display_output/"
files <- list.files(dir)
df <- data.frame("id"=NA, "level"=NA, "value"=NA )
df <- df[!is.na(df), ]
for (f in files){
if (file.size(file.path(dir, f)) > 0){
tmp <- read.csv(
file.path(dir, f), sep="\t",
header=F, col.names = c("level", "value"))
# tmp <- rbind(tmp, data.frame(
# "level"="Total",
# "value"=sum(tmp$value)))
tmp$id <- f
df <- rbind(df, tmp)
}
}
str(df)
## 'data.frame': 931 obs. of 3 variables:
## $ level: Factor w/ 8 levels "Archaea","Bacteria",..: 2 1 3 4 2 3 1 2 3 1 ...
## $ value: int 76001207 1594049 343215 29310 7385 160 134 3490 271 12 ...
## $ id : chr "0013693d2c6d676d343731343433372e33.txt" "0013693d2c6d676d343731343433372e33.txt" "0013693d2c6d676d343731343433372e33.txt" "0013693d2c6d676d343731343433372e33.txt" ...
…adding columns with the mgrast id and for the fractional abundances. From this we get an average viral abundance:
df <- df %>%
group_by(id) %>%
mutate(frac = value / sum(value)) %>%
as_data_frame()
str(df)
## Classes 'tbl_df', 'tbl' and 'data.frame': 931 obs. of 4 variables:
## $ level: Factor w/ 8 levels "Archaea","Bacteria",..: 2 1 3 4 2 3 1 2 3 1 ...
## $ value: int 76001207 1594049 343215 29310 7385 160 134 3490 271 12 ...
## $ id : chr "0013693d2c6d676d343731343433372e33.txt" "0013693d2c6d676d343731343433372e33.txt" "0013693d2c6d676d343731343433372e33.txt" "0013693d2c6d676d343731343433372e33.txt" ...
## $ frac : num 0.974777 0.020445 0.004402 0.000376 0.961714 ...
df <- as.data.frame(df)
str(df)
## 'data.frame': 931 obs. of 4 variables:
## $ level: Factor w/ 8 levels "Archaea","Bacteria",..: 2 1 3 4 2 3 1 2 3 1 ...
## $ value: int 76001207 1594049 343215 29310 7385 160 134 3490 271 12 ...
## $ id : chr "0013693d2c6d676d343731343433372e33.txt" "0013693d2c6d676d343731343433372e33.txt" "0013693d2c6d676d343731343433372e33.txt" "0013693d2c6d676d343731343433372e33.txt" ...
## $ frac : num 0.974777 0.020445 0.004402 0.000376 0.961714 ...
average_viral_frac = mean(df[df$level=="Viruses", "frac"])
Our average abundance is 0.0011495.
Lastly, we plot the abundances on a log scale.
ggplot(df, aes(x=level, y=frac * 100, group=level)) + geom_boxplot() + scale_y_log10(breaks=c( 0.001, .01, .1, 1, 100)) + coord_flip() +
geom_hline(aes(yintercept=average_viral_frac * 100,
color = "green"))+
scale_color_discrete(guide=F)+
labs(subtitle = paste("Domain diversity in ",length(files), "AD metagenomes "),
y="Percent of Total Reads",
x = "")
Next, we hope to calcalte what we can expect from a single flow cell. Our average percentage of viral DNA is {r average_viral_frac}
. What are the rarefaction curves looking like for these datasets?
we downloaded the data, again using the MG-RAST API.
mkdir rare_output;
while read x;
do
wget -O rare_output/${x}.txt "http://api-ui.mg-rast.org/metagenome/${x}?verbosity=stats&detail=rarefaction"
done < accessions.txt
Each file is a list of values, so we do some data munging to read it in to something useful.
target_coverage = 20
raredir <- "./rare_output/"
rarefiles <- list.files(raredir)
f <- rarefiles[1]
raredf <- data.frame("x"=NaN, "y"=NaN, "id"=NaN )
raredf <- raredf[!is.na(raredf), ]
for (f in rarefiles){
x <- readLines(file.path(raredir, f))
splits <- strsplit(x, "\\],\\[")[[1]]
# clean up first and last items
splits[1] <- substring(splits[1], first=3)
splits[length(splits)] <- substring(splits[length(splits)], first=1, last=nchar(splits[length(splits)]) - 2)
splitdf <- as.data.frame(splits) %>% separate(splits, into = paste("V", 1:2, sep = ","), sep = ",")
colnames(splitdf) <- c("x", "y")
splitdf$id <- f
raredf <- rbind(raredf, splitdf)
}
str(raredf)
## 'data.frame': 224079 obs. of 3 variables:
## $ x : chr "0" "29628" "59256" "88884" ...
## $ y : chr "2.76496777340435" "1847.17397431562" "2063.7714352611" "2211.80358929291" ...
## $ id: chr "0013693d2c6d676d343731343433372e33.txt" "0013693d2c6d676d343731343433372e33.txt" "0013693d2c6d676d343731343433372e33.txt" "0013693d2c6d676d343731343433372e33.txt" ...
Lastly, we convert values to numeric, and draw a plot:
raredf$x <- as.numeric(raredf$x)
raredf$y <- as.numeric(raredf$y)
ggplot(raredf, aes(x=x, y=y, group=id)) + geom_line(color="black", alpha=.1)