Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
P
PanGeTools
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Container Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Alexis Mergez
PanGeTools
Commits
a7751c03
Commit
a7751c03
authored
9 months ago
by
Alexis Mergez
Browse files
Options
Downloads
Patches
Plain Diff
Update gaf2aln.py
parent
a12ec03c
No related branches found
No related tags found
1 merge request
!2
Update PanGeTools.def
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
gaf2aln.py
+104
-25
104 additions, 25 deletions
gaf2aln.py
with
104 additions
and
25 deletions
gaf2aln.py
+
104
−
25
View file @
a7751c03
...
...
@@ -167,6 +167,55 @@ links = {}
# Parsing the gfa
print
(
f
"
[gaf2aln::GFA Parser] Extracting nodes, paths and links ...
"
)
def
GFA_parser
(
gfa_lines
,
nodes
=
nodes
):
_links
,
_nodes
,
_nodes_length
,
paths
=
{},
{},
{},
{}
for
line
in
gfa_lines
:
line_content
=
line
[:
-
1
].
split
(
"
\t
"
)
line_id
=
line_content
[
0
]
# Segment line
if
line_id
==
"
S
"
:
_nodes_length
[
str
(
line_content
[
1
])]
=
len
(
line_content
[
2
])
# Link line
elif
line_id
==
"
L
"
:
try
:
_links
[
str
(
line_content
[
1
])][
str
(
line_content
[
3
])]
=
{
"
FROM
"
:
str
(
line_content
[
2
]),
"
TO
"
:
str
(
line_content
[
4
])
}
except
:
_links
[
str
(
line_content
[
1
])]
=
{
str
(
line_content
[
3
])
:
{
"
FROM.ORIENT
"
:
str
(
line_content
[
2
]),
"
TO.ORIENT
"
:
str
(
line_content
[
4
])}
}
# Path line
elif
line_id
==
"
P
"
:
_paths
[
str
(
line_content
[
1
])]
=
{
"
NODES
"
:
{
str
(
node_id
[:
-
1
]):
str
(
node_id
[
-
1
])
for
node_id
in
line_content
[
2
].
split
(
'
,
'
)
},
"
CIGAR
"
:
line_content
[
3
]
}
return
[
_links
,
_nodes
,
_nodes_length
,
_paths
]
# splits = np.quantile(range(len(gfa_lines)+1), q= np.array(args.threads+1)/args.threads, method='higher').tolist()
# res = []
# for i in range(1, len(splits)):
# res.append(executor.submit(GFA_parser, gfa_lines[splits[i-1]:splits[i]]))
# for out in res:
# results = out.result()
# for link_id, link_info in results[0].items():
# links[]
for
line
in
gfa_lines
:
line_content
=
line
[:
-
1
].
split
(
"
\t
"
)
line_id
=
line_content
[
0
]
...
...
@@ -201,27 +250,44 @@ for line in gfa_lines:
del
gfa_lines
print
(
f
"
[gaf2aln::Graph alignment processing] Computing nodes positions in each paths...
"
)
# Adding nodes positions relative to path
for
path_name
in
paths
.
keys
():
print
(
f
"
[gaf2aln::Graph processing] Running on
{
path_name
}
...
"
)
print
(
f
"
[gaf2aln::Graph position processing] Computing nodes positions in each paths...
"
)
def
get_node_pos
(
path_name
,
nodes
=
nodes
,
paths
=
paths
,
nodes_length
=
nodes_length
):
print
(
f
"
[gaf2aln::Graph position processing] Running on
{
path_name
}
...
"
)
cur_pos
=
0
out
=
{}
# Iterating over nodes in the path
for
path_node
in
paths
[
path_name
][
"
NODES
"
].
keys
():
# Instead of checking if the node is one interesting node, we try to add to the nodes dict
try
:
nodes
[
path_node
]
[
path_name
]
=
{
if
path_node
in
aln_nodes
:
out
[
path_node
]
=
{
"
START
"
:
cur_pos
,
# Start position of the node start in the currrent path
"
END
"
:
cur_pos
+
nodes_length
[
path_node
],
# End position of the node end in the current path
"
STRAND
"
:
paths
[
path_name
][
"
NODES
"
][
node
_id
]
# Orientation of the node in the current path
"
STRAND
"
:
paths
[
path_name
][
"
NODES
"
][
path_
node
]
# Orientation of the node in the current path
}
cur_pos
+=
nodes_length
[
path_node
]
+
1
e
xcept
:
e
lse
:
cur_pos
+=
nodes_length
[
path_node
]
+
1
print
(
f
"
[gaf2aln::Graph alignment processing] Computing nodes positions in each alignement...
"
)
return
out
res
=
{}
executor
=
concurrent
.
futures
.
ThreadPoolExecutor
(
max_workers
=
args
.
threads
)
# Adding nodes positions relative to path
for
path_name
in
paths
.
keys
():
res
[
path_name
]
=
executor
.
submit
(
get_node_pos
,
path_name
)
executor
.
shutdown
(
wait
=
True
)
for
path_name
,
out
in
res
.
items
():
results
=
out
.
result
()
for
path_node
,
node_pos
in
results
.
items
():
nodes
[
path_node
][
path_name
]
=
node_pos
del
res
print
(
f
"
[gaf2aln::Alignment position processing] Computing nodes positions in each alignement...
"
)
# Adding nodes positions relative to path
def
get_aln_node_info
(
aln_name
,
aln_dict
=
aln_dict
,
nodes_length
=
nodes_length
):
...
...
@@ -271,14 +337,22 @@ def get_aln_node_info(aln_name, aln_dict = aln_dict, nodes_length = nodes_length
return
res
# Storing alignement
aln_processing
=
{}
res
=
{}
executor
=
concurrent
.
futures
.
ThreadPoolExecutor
(
max_workers
=
args
.
threads
)
for
aln_name
in
aln_dict
.
keys
():
print
(
f
"
[gaf2aln::
Graph
processing] Running on
{
aln_name
}
...
"
)
print
(
f
"
[gaf2aln::
Alignment position
processing] Running on
{
aln_name
}
...
"
)
_
=
get_aln_node_info
(
aln_name
,
aln_dict
=
aln_dict
,
nodes_length
=
nodes_length
)
res
[
aln_name
]
=
executor
.
submit
(
get_aln_node_info
,
aln_name
)
#res[aln_name] = get_aln_node_info(aln_name, aln_dict = aln_dict, nodes_length = nodes_length)
executor
.
shutdown
(
wait
=
True
)
for
node_id
,
res
in
_
.
items
():
nodes
[
node_id
][
aln_name
]
=
res
for
aln_name
,
node_info
in
res
.
items
():
results
=
node_info
.
result
()
for
node_id
,
info
in
results
.
items
():
nodes
[
node_id
][
aln_name
]
=
info
del
res
# Calculating CIGAR for each nodes in each aln
print
(
f
"
[gaf2aln::CIGAR processing] Computing nodes cigar from alignement ...
"
)
...
...
@@ -306,7 +380,6 @@ ALNS = {}
for
aln_name
in
aln_dict
.
keys
():
for
path_name
in
paths
.
keys
():
ALNS
[(
path_name
,
aln_name
)]
=
[]
_
=
[]
for
node_id
,
orient
in
aln_dict
[
aln_name
][
"
PATH.MATCH
"
]:
...
...
@@ -316,7 +389,10 @@ for aln_name in aln_dict.keys():
q_end
=
n_info
[
aln_name
][
"
END
"
]
_CG
=
n_info
[
aln_name
][
"
CIGAR
"
]
try
:
print
(
node_id
,
path_name
,
q_start
,
q_end
)
if
path_name
in
list
(
n_info
.
keys
()):
print
(
"
\t
In path
"
)
if
n_info
[
aln_name
][
"
STRAND
"
]
==
n_info
[
path_name
][
"
STRAND
"
]
:
t_start
=
n_info
[
path_name
][
"
START
"
]
+
n_info
[
aln_name
][
"
S.OFF
"
]
t_end
=
n_info
[
path_name
][
"
END
"
]
+
n_info
[
aln_name
][
"
E.OFF
"
]
...
...
@@ -324,6 +400,8 @@ for aln_name in aln_dict.keys():
t_end
=
n_info
[
path_name
][
"
START
"
]
+
n_info
[
aln_name
][
"
S.OFF
"
]
t_start
=
n_info
[
path_name
][
"
END
"
]
+
n_info
[
aln_name
][
"
E.OFF
"
]
print
(
"
\t
"
,
t_start
,
t_end
)
# Non empty temporary list of aln and ending of the last block is the same as the start of the new node :
if
len
(
_
)
and
_
[
-
1
][
"
T.END
"
]
==
t_start
and
_
[
-
1
][
"
Q.END
"
]
==
q_start
:
tmp_aln
[
"
Q.END
"
]
=
q_end
...
...
@@ -343,17 +421,18 @@ for aln_name in aln_dict.keys():
"
T.END
"
:
t_end
,
"
CG
"
:
_CG
,
}
except
:
print
(
"
\t
"
,
tmp_aln
)
else
:
print
(
"
\t
Not in path
"
)
# Node is not in the path
tmp_aln
=
{
"
Q.START
"
:
q_start
,
"
Q.END
"
:
q_end
,
"
T.START
"
:
-
1
,
"
T.END
"
:
-
1
,
"
CG
"
:
f
"
{
nodes_length
[
node_id
]
}
D
"
}
print
(
ALNS
)
_
.
append
(
tmp_aln
)
ALNS
[(
path_name
,
aln_name
)]
=
_
## Debug
for
elem
in
ALNS
[(
"
TO1000#1#chr03
"
,
"
ALN_1
"
)]:
print
(
elem
)
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment