|
IEEE Transactions on Computer Aided Design of Integrated Circuits and Systems
Girish Venkataramani, Tobias Bjerregaard, Tiberiu Chelcea, and Seth Copen Goldstein
25(5):756–771
1990
AbstractA major obstacle to successful high-level synthesis (HLS) of large-scale application-specified integrated circuit systems is the presence of memory accesses to a shared-memory subsystem. The latency to access memory is often not statically predictable, which creates problems for scheduling operations dependent on memory reads. More fundamental is that dependences between accesses may not be statically provable (e.g., if the specification language permits pointers), which introduces memory-consistency problems. Addressing these issues with static scheduling results in overly conservative circuits, and thus, most state-of-the-art HLS tools limit memory systems to those that have predictable latencies and limit programmers to specifications that forbid arbitrary memory-reference patterns. A new HLS framework for the synthesis and optimization of memory accesses (SOMA) is presented. SOMA enables specifications to include arbitrary memory references (e.g., pointers) and allows the memory system to incorporate features that might cause the latency of a memory access to vary dynamically. This results in raising the level of abstraction in the input specification, enabling faster design times. SOMA synthesizes a memory access network (MAN) architecture that facilitates dynamic scheduling and ordering of memory accesses. The paper describes a basic MAN construction technique that illustrates how dynamic ordering helps in efficiently maintaining memory consistency and how dynamic scheduling helps alleviate the variable-latency problem. Then, it is shown how static analysis of the access patterns can be used to optimize the MAN. One optimization changes the MAN interconnect topology to increase concurrence. A second optimization reduces the synchronization overhead necessary to maintain memory consistency. Postlayout experiments demonstrate that SOMA’s application-specific MAN construction significantly improves power and performance for a range of benchmarks.
download pdf
@article{venkataramani-tcad06,
title = {Hardware Compilation of Application-Specific Memory Access
Interconnect},
author = {Venkataramani, Girish and Bjerregaard, Tobias and Chelcea,
Tiberiu and Goldstein, Seth Copen},
journal = {IEEE Transactions on Computer Aided Design of Integrated
Circuits and Systems},
year = {2006},
volume = {25},
number = {5},
pages = {756--771},
issn = {0278-0070},
abstract = {{A major obstacle to successful high-level synthesis
(HLS) of large-scale application-specified integrated circuit
systems is the presence of memory accesses to a shared-memory
subsystem. The latency to access memory is often not statically
predictable, which creates problems for scheduling operations
dependent on memory reads. More fundamental is that dependences
between accesses may not be statically provable (e.g., if the
specification language permits pointers), which introduces
memory-consistency problems. Addressing these issues with static
scheduling results in overly conservative circuits, and thus,
most state-of-the-art HLS tools limit memory systems to those
that have predictable latencies and limit programmers to
specifications that forbid arbitrary memory-reference patterns. A
new HLS framework for the synthesis and optimization of memory
accesses (SOMA) is presented. SOMA enables specifications to
include arbitrary memory references (e.g., pointers) and allows
the memory system to incorporate features that might cause the
latency of a memory access to vary dynamically. This results in
raising the level of abstraction in the input specification,
enabling faster design times. SOMA synthesizes a memory access
network (MAN) architecture that facilitates dynamic scheduling
and ordering of memory accesses. The paper describes a basic MAN
construction technique that illustrates how dynamic ordering
helps in efficiently maintaining memory consistency and how
dynamic scheduling helps alleviate the variable-latency problem.
Then, it is shown how static analysis of the access patterns can
be used to optimize the MAN. One optimization changes the MAN
interconnect topology to increase concurrence. A second
optimization reduces the synchronization overhead necessary to
maintain memory consistency. Postlayout experiments demonstrate
that SOMA's application-specific MAN construction significantly
improves power and performance for a range of benchmarks.}},
keywords = {Asychronous Circuits, Spatial
Computing,Phoenix,Network-on-a-chip},
url = {http://www.cs.cmu.edu/~seth/papers/venkataramani-tcad06.pdf},
}
Related Papers
Asychronous Circuits |
|
Heterogeneous Latch-Based Asynchronous Pipelines | pdf bib | |
Girish Venkataramani, Tiberiu Chelcea, and Seth Copen Goldstein.
Asynchronous Circuits and Systems, International Symposium on,
pages 83–92, 1990.
|
| @inproceedings{venkataramani-async08,
author = {Venkataramani, Girish and Chelcea, Tiberiu and Goldstein,
Seth Copen},
title = {Heterogeneous Latch-Based Asynchronous Pipelines},
journal = {Asynchronous Circuits and Systems, International
Symposium on},
year = {2008},
issn = {1522-8681},
pages = {83--92},
keywords = {Asychronous Circuits},
doi = {http://doi.ieeecomputersociety.org/10.1109/ASYNC.2008.21},
publisher = {IEEE Computer Society},
address = {Los Alamitos, CA, USA},
abstract = {We present a technique to automatically synthesize
heterogeneous asynchronous pipelines by combining two different
latching styles: normally open D-latches for high performance and
self-resetting D-latches for low power. Theformer is fast but
results in high power consumption due to data glitches that leak
through the latch when it is open. The latter is normally closed
and is opened just before data stabilizes. Thus, it is more
power-efficient but slower than normally open D-latches. We
propose a module selection optimization that assigns each
pipeline stage to one of these two latching styles. This is
performed by an automated algorithm that uses two types of
heuristics: (1) it uses the Global Critical Path (GCP), to assign
D-latches to stages that are sequentially critical, and (2) it
estimates potential datapath glitching to make SR-latch
assignment decisions. The algorithm has quadratic-time complexity
and experiments that apply the algorithm on several media
processing kernels indicate that, on average, the heterogeneous
pipelining algorithm achieves higher performance and is more
energy efficient than either the homogeneous D-latch or SR-latch
pipeline styles.},
url = {http://www.cs.cmu.edu/~seth/papers/venkataramani-async08.pdf},
}
|
|
Slack Analysis in the System Design Loop | bib talk | |
Girish Venkataramani and Seth Copen Goldstein.
In IEEE/ACM/IFIP International Conference on Hardware/Software Codesign and System Synthesis (CODES-ISSS),
pages 231–236, Oct 1990.
|
| @inproceedings{venkataramani-codes08,
author = {Venkataramani, Girish and Goldstein, Seth Copen},
booktitle = {IEEE/ACM/IFIP International Conference on
Hardware/Software Codesign and System Synthesis {(CODES-ISSS)}},
year = {2008},
address = {Atlanta, GE},
month = {Oct},
keywords = {Asychronous Circuits, CAD, Global Critical Path},
title = {Slack Analysis in the System Design Loop},
talk = {http://www.cs.cmu.edu/~seth/papers/talk-venkataramani-codes08.pdf},
pages = {231--236},
}
|
|
Area Optimizations for Dual-Rail Circuits Using Relative-Timing Analysis | pdf bib | |
Tiberiu Chelcea, Girish Venkataramani, and Seth Copen Goldstein.
In Proceedings of the 13th IEEE International Symposium on Asynchronous Circuits and Systems,
pages 117–128, Mar 1990.
|
| @inproceedings{chelcea-async07,
author = {Chelcea, Tiberiu and Venkataramani, Girish and Goldstein,
Seth Copen},
title = {Area Optimizations for Dual-Rail Circuits Using
Relative-Timing Analysis},
booktitle = {Proceedings of the 13th IEEE International Symposium on
Asynchronous Circuits and Systems},
year = {2007},
address = {Berkeley, CA},
month = {Mar},
pages = {117--128},
abstract = {Future deep sub-micron technologies will be
characterized by large parametric variations, which could make
asynchronous design an attractive solution for use on large
scale. However, the investment in asynchronous CAD tools does not
approach that in synchronous ones. Even when asynchronous tools
leverage existing synchronous toolflows, they introduce large
area and speed overheads. This paper proposes several heuristic
and optimal algorithms, based on timing interval analysis, for
improving existing asynchronous CAD solutions by optimizing area.
The optimized circuits are 2.4 times smaller for an optimal
algorithm and 1.8 times smaller for a heuristic one than the
existing solutions. The optimized circuits are also shown to be
resilient to large parametric variations, yielding better
average-case latencies than their synchronous counterparts.},
url = {http://www.cs.cmu.edu/~seth/papers/chelcea-async07.pdf},
keywords = {Asychronous Circuits, CAD},
}
|
|
Global Critical Path: A Tool for System-Level Timing Analysis | pdf bib | |
Girish Venkataramani, Mihai Budiu, Tiberiu Chelcea, and Seth Copen Goldstein.
In Proceedings of the 44th ACM/IEEE Design Automation Conference,
pages 783–786, Jun 1990.
|
| @inproceedings{dac07-gcp,
author = {Venkataramani, Girish and Budiu, Mihai and Chelcea,
Tiberiu and Goldstein, Seth Copen},
title = {Global Critical Path: A Tool for System-Level Timing
Analysis},
booktitle = {Proceedings of the 44th ACM/IEEE Design Automation
Conference},
year = {2007},
month = {Jun},
address = {San Diego, CA},
pages = {783--786},
abstract = {An effective method for focusing optimization effort on
the most important parts of a design is to examine those elements
on the critical path. Traditionally, the critical path is defined
at the RTL level, as the longest path in the combinational logic
between clocked reisters. In this paper, we present a
system-level timing analysis technique to define the concept of a
Global Critical Path (GCP), for predicting system-level
performance. We show how the GCP can be used as a theoretical and
practical tool for understanding, summarizing and optimizing the
behavior of highly concurrent self-timed circuits. We formally
define the GCP and show how it can be constructed using a
discrete event model and hardware profiling techniques. The GCP
provides valuable insight into the control-path behavior of
circuits and in finding system-level bottlenecks. We have
incorporated the GCP construction and analysis framework into a
high-level synthesis and simulation toolchain, thus enabling
complete automation in modeling, analysis and optimization.},
url = {http://www.cs.cmu.edu/~seth/papers/dac07-gcp.pdf},
keywords = {Asychronous Circuits, CAD, Global Critical Path, System
modeling, Hardware profiling},
}
|
|
Operation Chaining Asynchronous Pipelined Circuits | pdf bib | |
Girish Venkataramani and Seth Copen Goldstein.
In ICCAD,
Nov 1990.
|
| @inproceedings{venkataramani-iccad07,
author = {Venkataramani, Girish and Goldstein, Seth Copen},
title = {Operation Chaining Asynchronous Pipelined Circuits},
booktitle = {ICCAD},
abstract = {We define operation chaining (op-chaining) as an
optimization problem to determine the optimal pipeline depth for
balancing performance against energy demands in pipelined
asynchronous designs. Since there are no clock period
requirements, asynchronous pipeline stages can have non-uniform
latencies. We exploit this fact to coalesce several stages
together thereby saving power and area due to the elimination of
control-path resources from the pipeline. The trade-off is
potentially reduced pipeline parallelism. In this paper, we
formally define this optimization as a graph covering problem,
which finds sub-graphs that will be synthesized as an opchained
pipeline stage. We then define the solution space for provably
correct solutions and present an algorithm to efficiently search
this space. The search technique partitions the graph based on
post-dominator relationships to find sub-graphs that are
potential op-chain candidates. We use knowledge of the Global
Critical Path (GCP) [13] to evaluate the performance impact of
accepting a candidate sub-graph and formulate a heuristic cost
function to model this trade-off. The algorithm has a
quadratic-time complexity in the size of the dataflow graph. We
have implemented this algorithm within an automated asynchronous
synthesis toolchain [12]. Experimental evidence from applying the
algorithm on several media processing kernels reveals that the
average energy-delay and energy-delay-area products improve by
about 1.4x and 1.8x respectively, with a maximum improvement of
5x and 18x.},
month = {Nov},
year = {2007},
url = {http://www.cs.cmu.edu/~seth/papers/venkataramani-iccad07.pdf},
keywords = {Asychronous Circuits, CAD, Global Critical Path},
}
|
|
Self-Resetting Latches for Asynchronous Micro-Pipelines | pdf bib | |
Tiberiu Chelcea, Girish Venkataramani, and Seth Copen Goldstein.
In Proceedings of the 44th ACM/IEEE Design Automation Conference,
pages 986–989, Jun 1990.
|
| @inproceedings{dac07-sr,
author = {Chelcea, Tiberiu and Venkataramani, Girish and Goldstein,
Seth Copen},
title = {Self-Resetting Latches for Asynchronous Micro-Pipelines},
booktitle = {Proceedings of the 44th ACM/IEEE Design Automation
Conference},
year = {2007},
month = {Jun},
address = {San Diego, CA},
pages = {986--989},
keywords = {Asychronous Circuits},
abstract = {Asynchronous circuits are increasingly attractive as low
power or high-performance replacements to synchronous designs. A
key part of these circuits are asynchronous micropipelines;
unfortunatelly, the existing micropipeline styles either improve
performance or decrease power consumption, but not both. Very
often, the pipeline register plays a crucial role in these cost
metrics. In this paper we introduce a new register design, called
self-resetting latches, for asynchronous micropipelines which
bridges the gap between fast, but power hungry, latch-based
designs and slow, but low power, flip-flop designs. The
energy-delay metric for large asynchronous systems implemented
with self-resetting latches is, on average, 41\% better than
latch-based designs and 15\% better than flip-flop designs.},
url = {http://www.cs.cmu.edu/~seth/papers/dac07-sr.pdf},
}
|
|
Hardware Compilation of Application-Specific Memory Access Interconnect | pdf bib | |
Girish Venkataramani, Tobias Bjerregaard, Tiberiu Chelcea, and Seth Copen Goldstein.
IEEE Transactions on Computer Aided Design of Integrated Circuits and Systems,
25(5):756–771, 1990.
|
| @article{venkataramani-tcad06,
title = {Hardware Compilation of Application-Specific Memory Access
Interconnect},
author = {Venkataramani, Girish and Bjerregaard, Tobias and Chelcea,
Tiberiu and Goldstein, Seth Copen},
journal = {IEEE Transactions on Computer Aided Design of Integrated
Circuits and Systems},
year = {2006},
volume = {25},
number = {5},
pages = {756--771},
issn = {0278-0070},
abstract = {{A major obstacle to successful high-level synthesis
(HLS) of large-scale application-specified integrated circuit
systems is the presence of memory accesses to a shared-memory
subsystem. The latency to access memory is often not statically
predictable, which creates problems for scheduling operations
dependent on memory reads. More fundamental is that dependences
between accesses may not be statically provable (e.g., if the
specification language permits pointers), which introduces
memory-consistency problems. Addressing these issues with static
scheduling results in overly conservative circuits, and thus,
most state-of-the-art HLS tools limit memory systems to those
that have predictable latencies and limit programmers to
specifications that forbid arbitrary memory-reference patterns. A
new HLS framework for the synthesis and optimization of memory
accesses (SOMA) is presented. SOMA enables specifications to
include arbitrary memory references (e.g., pointers) and allows
the memory system to incorporate features that might cause the
latency of a memory access to vary dynamically. This results in
raising the level of abstraction in the input specification,
enabling faster design times. SOMA synthesizes a memory access
network (MAN) architecture that facilitates dynamic scheduling
and ordering of memory accesses. The paper describes a basic MAN
construction technique that illustrates how dynamic ordering
helps in efficiently maintaining memory consistency and how
dynamic scheduling helps alleviate the variable-latency problem.
Then, it is shown how static analysis of the access patterns can
be used to optimize the MAN. One optimization changes the MAN
interconnect topology to increase concurrence. A second
optimization reduces the synchronization overhead necessary to
maintain memory consistency. Postlayout experiments demonstrate
that SOMA's application-specific MAN construction significantly
improves power and performance for a range of benchmarks.}},
keywords = {Asychronous Circuits, Spatial
Computing,Phoenix,Network-on-a-chip},
url = {http://www.cs.cmu.edu/~seth/papers/venkataramani-tcad06.pdf},
}
|
|
Leveraging Protocol Knowledge in Slack Matching | pdf bib | |
Girish Venkataramani and Seth Copen Goldstein.
In IEEE/ACM International Conference on Computer-Aided Design (ICCAD),
Nov 1990.
|
| @inproceedings{venkataramani-iccad06,
title = {Leveraging Protocol Knowledge in Slack Matching},
author = {Venkataramani, Girish and Goldstein, Seth Copen},
booktitle = {IEEE/ACM International Conference on Computer-Aided
Design (ICCAD)},
year = {2006},
address = {San Jose, CA},
month = {Nov},
abstract = {{Stalls, due to mis-matches in communication rates, are
a major performance obstacle in pipelined circuits. If the rate
of data production is faster than the rate of consumption, the
resulting design performs slower than when the communication rate
is matched. This can be remedied by inserting pipeline buffers
(to temporarily hold data), allowing the producer to proceed if
the consumer is not ready to accept data. The problem of deciding
which channels need these buffers (and how many) for an arbitrary
communication profile is called the slack matching problem; the
optimal solution to this problem has been shown to be
NP-complete. \par In this paper, we present a heuristic that uses
knowledge of the communication protocol to explicitly model these
bottlenecks, and an iterative algorithm to progressively remove
these bottlenecks by inserting buffers. We apply this algorithm
to asynchronous circuits, and show that it naturally handles
large designs with arbitrarily cyclic and acyclic topologies,
which exhibit various types of control choice. The heuristic is
efficient, achieving linear time complexity in practice, and
produces solutions that (a) achieve up to 60\% performance
speedup on large media processing kernels, and (b) can either be
verified to be optimal, or the approximation margin can be
bounded. }},
keywords = {Asychronous Circuits, Spatial Computing, CAD, Global
Critical Path},
url = {http://www.cs.cmu.edu/~seth/papers/venkataramani-iccad06.pdf},
}
|
|
Modeling the Global Critical Path in Concurrent Systems | pdf bib | |
Girish Venkataramani, Tiberiu Chelcea, Mihai Budiu, and Seth Copen Goldstein.
Carnegie Mellon University Technical Report No. CMU-CS-06-144,
Aug 1990.
|
| @techreport{venkataramani-tr06,
author = {Venkataramani, Girish and Chelcea, Tiberiu and Budiu,
Mihai and Goldstein, Seth Copen},
title = {Modeling the Global Critical Path in Concurrent Systems},
institution = {Carnegie Mellon University},
year = {2006},
number = {CMU-CS-06-144},
month = {Aug},
abstract = {We show how the global critical path can be used as a
practical tool for understanding, optimizing and summarizing the
behavior of highly concurrent self-timed circuits. Traditionally,
critical path analysis has been applied to DAGs, and thus was
constrained to combinatorial sub-circuits. We formally define the
global critical path (GCP) and show how it can be constructed
using only local information that is automatically derived
directly from the circuit. We introduce a form of Production
Rules, which can accurately determine the GCP for a given input
vector, even for modules which exhibit choice and early
termination. \par The GCP provides valuable insight into the
control behavior of the application, which help in formulating
new optimizations and re-formulating existing ones to use the GCP
knowledge. We have constructed a fully automated framework for
GCP detection and analysis, and have incorporated this framework
into a high-level synthesis tool-chain. We demonstrate the
effectiveness of the GCP framework by re-formulating two
traditional CAD optimizations to use the GCP, yielding efficient
algorithms which improve circuit power (by up to 9\%) and
performance (by up to 60\%) in our experiments.},
keywords = {Asychronous Circuits, Spatial Computing,CAD, Global
Critical Path},
url = {http://www.cs.cmu.edu/~seth/papers/venkataramani-tr06.pdf},
}
|
|
Tartan: Evaluating Spatial Computation for Whole Program Execution | pdf bib | |
Mahim Mishra, Timothy J Callahan, Tiberiu Chelcea, Girish Venkataramani, Mihai Budiu, and Seth Copen Goldstein.
In 12th ACM International Conference on Architecture Support for Programming Languages and Operating Systems (ASPLOS),
pages 163–174, Oct 1990.
|
| @inproceedings{mahim-asplos06,
title = {Tartan: Evaluating Spatial Computation for Whole Program
Execution},
author = {Mishra, Mahim and Callahan, Timothy J and Chelcea, Tiberiu
and Venkataramani, Girish and Budiu, Mihai and Goldstein, Seth
Copen},
booktitle = {12th ACM International Conference on Architecture
Support for Programming Languages and Operating Systems
(ASPLOS)},
year = {2006},
pages = {163--174},
address = {San Jose, CA},
month = {Oct},
abstract = {Spatial Computing (SC) has been shown to be an
energy-efficient model for implementing program kernels. In this
paper we explore the feasibility of using SC for more than small
kernels. To this end, we evaluate the performance and energy
efficiency of entire applications on Tartan, a general-purpose
architecture which integrates a reconfigurable fabric (RF) with a
superscalar core. Our compiler automatically partitions and
compiles an application into an instruction stream for the core
and a configuration for the RF. We use a detailed simulator to
capture both timing and energy numbers for all parts of the
system. \par Our results indicate that a hierarchical RF
architecture, designed around a scalable interconnect, is
instrumental in harnessing the benefits of spatial computation.
The interconnect uses static configuration and routing at the
lower levels and a packet-switched, dynamically-routed network at
the top level. Tartan is most energy-efficient when almost all of
the application is mapped to the RF, indicating the need for the
RF to support most general-purpose programming constructs. Our
initial investigation reveals that such a system can provide, on
average, an order of magnitude improvement in energy-delay
compared to an aggressive superscalar core on single-threaded
workloads.},
keywords = {Asychronous Circuits, Spatial Computing, Reconfigurable
Computing,Phoenix, Tartan},
url = {http://www.cs.cmu.edu/~seth/papers/mahim-asplos06.pdf},
}
|
|
Adding Faster with Application Specific Early Termination | pdf bib | |
David Ryan Koes, Tiberiu Chelcea, Charles Onyeama, and Seth Copen Goldstein.
Carnegie Mellon University Technical Report No. CMU-CS-05-101,
pages 20, May 1990.
|
| @techreport{koes-tr05,
author = {Koes, David Ryan and Chelcea, Tiberiu and Onyeama, Charles
and Goldstein, Seth Copen},
title = {Adding Faster with Application Specific Early Termination},
institution = {Carnegie Mellon University},
year = {2005},
number = {CMU-CS-05-101},
pages = {20},
month = {May},
url = {http://www.cs.cmu.edu/~seth/papers/koes-tr05.pdf},
abstract = {This paper presents a methodology for improving the
speed of high-speed adders. As a starting point, a previously
proposed method, called speculative completion, is used in which
fast- terminating additions are automatically detected. Unlike
the previous design, the method proposed in this paper is able to
adapt dynamically to (1) application-specific behavior and (2) to
adder- specific behavior, resulting in a higher detection rate of
fast additions and, consequently, a faster average-case speed for
addition. Our experimental results show detection rates of over
99\%, and adder average-case speed improvements of up to 14.\%.},
keywords = {Asychronous Circuits},
}
|
|
SOMA: A Tool for Synthesizing and Optimizing Memory Accesses in ASICs | pdf bib | |
Girish Venkataramani, Tobias Bjerregaard, Tiberiu Chelcea, and Seth Copen Goldstein.
In IEEE/ACM/IFIP International Conference on Hardware/Software Codesign and System Synthesis (CODES-ISSS),
pages 231–236, Sep 1990.
|
| @inproceedings{venkataramani-isss05,
title = {SOMA: A Tool for Synthesizing and Optimizing Memory
Accesses in ASICs},
author = {Venkataramani, Girish and Bjerregaard, Tobias and Chelcea,
Tiberiu and Goldstein, Seth Copen},
booktitle = {IEEE/ACM/IFIP International Conference on
Hardware/Software Codesign and System Synthesis (CODES-ISSS)},
year = {2005},
isbn = {1-59593-161-9},
pages = {231-236},
address = {Jersey City, NJ, USA},
month = {Sep},
abstract = {Arbitrary memory dependencies and variable latency
memory systems are major obstacles to the synthesis of
large-scale ASIC systems in high-level synthesis. This paper
presents SOMA, a synthesis framework for constructing Memory
Access Network (MAN) architectures that inherently enforce memory
consistency in the presence of dynamic memory access
dependencies. A fundamental bottleneck in any such network is
arbitrating between concurrent accesses to a shared memory
resource. To alleviate this bottleneck, SOMA uses an
application-specific concurrency analysis technique to predict
the dynamic memory parallelism profile of the application. This
is then used to customize the MAN architecture. Depending on the
parallelism profile, the MAN may be optimized for latency,
throughput or both. The optimized MAN is automatically
synthesized into gate-level structural Verilog using a flexible
library of network building blocks. SOMA has been successfully
integrated into an automated C-to-hardware synthesis flow, which
generates standard cell circuits from unrestricted ANSI-C
programs. Post-layout experiments demonstrate that application
specific MAN construction significantly improves power and
performance.},
keywords = {Asychronous Circuits, Spatial Computing,Phoenix,
CAD,Compilers:Memory Optimizations},
url = {http://www.cs.cmu.edu/~seth/papers/venkataramani-isss05.pdf},
}
|
|
HLS Support for Unconstrained Memory Accesses | pdf bib | |
Girish Venkataramani, Tiberiu Chelcea, and Seth Copen Goldstein.
In IEEE 14th International Workshop on Logic Synthesis (IWLS),
Jun 1990.
|
| @inproceedings{venkataramani-iwls05,
title = {{HLS} Support for Unconstrained Memory Accesses},
author = {Venkataramani, Girish and Chelcea, Tiberiu and Goldstein,
Seth Copen},
booktitle = {IEEE 14th International Workshop on Logic Synthesis
(IWLS)},
year = {2005},
address = {Lake Arrowhead, CA},
month = {Jun},
abstract = {A major obstacle in high-level synthesis (HLS) of
large-scale ASIC systems is memory access patterns. Typically,
most state-of-the-art HLS tools impose constraints on the memory
references in the source application, requiring them to exhibit
predictable access patterns, and/or requiring dependencies
between them to be statically determinable. This paper addresses
the HLS problem when such constraints are relaxed. We present an
analysis infrastructure that can be used within any HLS toolflow
for synthesizing circuits from high-level abstractions, such as
ANSI-C, where no assumptions can be made about memory access
latencies, and where dependencies between memory references can
only be disambiguated dynamically at runtime (pointer aliasing).
We start by describing a generic framework to build a
dependence-aware, fully distributed, although often conservative,
memory-access network (MAN) for a given memory-dependence graph.
Then, we propose a suite of optimizations to customize the MAN
for the given specification. All these techniques guarantee
memory coherency. Experimental results on Mediabench benchmarks,
show that such an approach succeeds in maintaining high levels of
parallelism, while ensuring memory coherency. The optimizations
succeed in lowering the synchronization overhead by as much as
4x.},
keywords = {Asychronous Circuits, Spatial Computing,Phoenix},
url = {http://www.cs.cmu.edu/~seth/papers/venkataramani-iwls05.pdf},
}
|
|
Spatial Computation | pdf bib | |
Mihai Budiu, Girish Venkataramani, Tiberiu Chelcea, and Seth Copen Goldstein.
In International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS),
pages 14–26, Oct 1990.
|
| @inproceedings{budiu-asplos04,
author = {Budiu, Mihai and Venkataramani, Girish and Chelcea,
Tiberiu and Goldstein, Seth Copen},
title = {Spatial Computation},
booktitle = {International Conference on Architectural Support for
Programming Languages and Operating Systems (ASPLOS)},
pages = {14--26},
month = {Oct},
address = {Boston, MA},
year = {2004},
url = {http://www.cs.cmu.edu/~seth/papers/budiu-asplos04.pdf},
abstract = {This paper describes a computer architecture that relies
on the direct translation of high-level language programs into
{\em Spatial Computation} (SC) hardware structures. SC program
implementations are completely distributed, without any
centralized control. SC circuits are optimized for {\em wires} at
the expense of computation units. \par In this paper we
investigate a particular implementation SC structures called ASH
(Application-Specific Hardware). Under the assumption that
computation is cheaper than communication, ASH replicates
computation units to simplify interconnect, building a system
which uses very simple, completely dedicated communication
channels. As a consequence, communication on the datapath never
requires arbitration; the only arbitration required is for
accessing memory. ASH relies on very simple hardware primitives,
using no associative structures, no multiported register files,
no scheduling logic, no broadcast, and no clocks. As a
consequence, ASH hardware is fast and extremely power efficient.
\par In this work we demonstrate three features of ASH: (1) that
such architectures can be built by automatic compilation of C
programs, (2) that distributed computation is in some respects
fundamentally different from monolithic superscalar processors
and (3) that ASIC implementations of ASH use 3 orders of
magnitude less energy compared to high-end superscalar
processors, while being within a factor of two in performance.},
keywords = {Asychronous Circuits, Spatial Computing,Phoenix},
}
|
|
Translating ANSI C to Asynchronous Circuits | pdf bib | |
Mihai Budiu, Girish Venkataramani, Tiberiu Chelcea, and Seth Copen Goldstein.
In 10th IEEE International Symposium on Asynchronous Circuits and Systems (ASYNC '04),
Apr 1990.
|
| @inproceedings{budiu-async04,
title = {Translating ANSI C to Asynchronous Circuits},
url = {http://www.cs.cmu.edu/~seth/papers/budiu-async04.pdf},
booktitle = {10th IEEE International Symposium on Asynchronous
Circuits and Systems (ASYNC '04)},
author = {Budiu, Mihai and Venkataramani, Girish and Chelcea,
Tiberiu and Goldstein, Seth Copen},
address = {Crete, Greece},
year = {2004},
month = {Apr},
keywords = {Asychronous Circuits,CAD,Electronic Nanotechnology,Fault
and Defect Tolerance,Phoenix,Reconfigurable Computing,Spatial
Computing},
}
|
|
C to Asynchronous Dataflow Circuits: An End-to-End Toolflow | pdf bib | |
Girish Venkataramani, Mihai Budiu, Tiberiu Chelcea, and Seth Copen Goldstein.
In IEEE 13th International Workshop on Logic Synthesis (IWLS),
Jun 1990.
|
| @inproceedings{venkataramani-iwls04,
title = {{C} to Asynchronous Dataflow Circuits: An End-to-End
Toolflow},
author = {Venkataramani, Girish and Budiu, Mihai and Chelcea,
Tiberiu and Goldstein, Seth Copen},
booktitle = {IEEE 13th International Workshop on Logic Synthesis
(IWLS)},
address = {Temecula, CA},
month = {Jun},
year = {2004},
url = {http://www.cs.cmu.edu/~seth/papers/venkataramani-iwls04.pdf},
abstract = {We present a complete toolflow that translates ANSI-C
programs into asynchronous circuits. The toolflow is built around
a compiler that converts C into a functional dataflow
intermediate representation, exposing instruction-level, pipeline
and memory parallelism. The compiler performs optimizations and
converts the intermediate representation into pipelined
asynchronous circuits, with no centralized controllers. In the
resulting circuits, control is distributed, communication is
achieved through local wires, and arbitration for datapath
resources is unnecessary. Circuits automatically synthesized from
Mediabench kernels exhibit substantially better energy-delay than
either single-issue processors or aggressive superscalar cores.},
keywords = {Asychronous Circuits,Spatial Computing,Phoenix,CAD},
}
|
|
Molecules, Gates, Circuits, Computer | pdf bib | |
Seth Copen Goldstein and Mihai Budiu.
In Molecular Nanoelectronics,
Jan 1990.
|
| @incollection{goldstein-mn03,
title = {Molecules, Gates, Circuits, Computer},
url = {http://www.cs.cmu.edu/~seth/papers/goldstein-mn03.pdf},
booktitle = {Molecular Nanoelectronics},
author = {Goldstein, Seth Copen and Budiu, Mihai},
year = {2003},
editor = {Mark A. Reed and Takhee Lee},
publisher = {American Scientific Publishers},
address = {Stevenson Ranch, CA},
month = {Jan},
isbn = {1-588883-006-3},
keywords = {Asychronous Circuits,CAD,Electronic Nanotechnology,Fault
and Defect Tolerance,Reconfigurable Computing,Spatial
Computing,electronic nanotechnology,molecular electronics},
}
|
Phoenix |
|
Hardware Compilation of Application-Specific Memory Access Interconnect | pdf bib | |
Girish Venkataramani, Tobias Bjerregaard, Tiberiu Chelcea, and Seth Copen Goldstein.
IEEE Transactions on Computer Aided Design of Integrated Circuits and Systems,
25(5):756–771, 1990.
|
| @article{venkataramani-tcad06,
title = {Hardware Compilation of Application-Specific Memory Access
Interconnect},
author = {Venkataramani, Girish and Bjerregaard, Tobias and Chelcea,
Tiberiu and Goldstein, Seth Copen},
journal = {IEEE Transactions on Computer Aided Design of Integrated
Circuits and Systems},
year = {2006},
volume = {25},
number = {5},
pages = {756--771},
issn = {0278-0070},
abstract = {{A major obstacle to successful high-level synthesis
(HLS) of large-scale application-specified integrated circuit
systems is the presence of memory accesses to a shared-memory
subsystem. The latency to access memory is often not statically
predictable, which creates problems for scheduling operations
dependent on memory reads. More fundamental is that dependences
between accesses may not be statically provable (e.g., if the
specification language permits pointers), which introduces
memory-consistency problems. Addressing these issues with static
scheduling results in overly conservative circuits, and thus,
most state-of-the-art HLS tools limit memory systems to those
that have predictable latencies and limit programmers to
specifications that forbid arbitrary memory-reference patterns. A
new HLS framework for the synthesis and optimization of memory
accesses (SOMA) is presented. SOMA enables specifications to
include arbitrary memory references (e.g., pointers) and allows
the memory system to incorporate features that might cause the
latency of a memory access to vary dynamically. This results in
raising the level of abstraction in the input specification,
enabling faster design times. SOMA synthesizes a memory access
network (MAN) architecture that facilitates dynamic scheduling
and ordering of memory accesses. The paper describes a basic MAN
construction technique that illustrates how dynamic ordering
helps in efficiently maintaining memory consistency and how
dynamic scheduling helps alleviate the variable-latency problem.
Then, it is shown how static analysis of the access patterns can
be used to optimize the MAN. One optimization changes the MAN
interconnect topology to increase concurrence. A second
optimization reduces the synchronization overhead necessary to
maintain memory consistency. Postlayout experiments demonstrate
that SOMA's application-specific MAN construction significantly
improves power and performance for a range of benchmarks.}},
keywords = {Asychronous Circuits, Spatial
Computing,Phoenix,Network-on-a-chip},
url = {http://www.cs.cmu.edu/~seth/papers/venkataramani-tcad06.pdf},
}
|
|
Tartan: Evaluating Spatial Computation for Whole Program Execution | pdf bib | |
Mahim Mishra, Timothy J Callahan, Tiberiu Chelcea, Girish Venkataramani, Mihai Budiu, and Seth Copen Goldstein.
In 12th ACM International Conference on Architecture Support for Programming Languages and Operating Systems (ASPLOS),
pages 163–174, Oct 1990.
|
| @inproceedings{mahim-asplos06,
title = {Tartan: Evaluating Spatial Computation for Whole Program
Execution},
author = {Mishra, Mahim and Callahan, Timothy J and Chelcea, Tiberiu
and Venkataramani, Girish and Budiu, Mihai and Goldstein, Seth
Copen},
booktitle = {12th ACM International Conference on Architecture
Support for Programming Languages and Operating Systems
(ASPLOS)},
year = {2006},
pages = {163--174},
address = {San Jose, CA},
month = {Oct},
abstract = {Spatial Computing (SC) has been shown to be an
energy-efficient model for implementing program kernels. In this
paper we explore the feasibility of using SC for more than small
kernels. To this end, we evaluate the performance and energy
efficiency of entire applications on Tartan, a general-purpose
architecture which integrates a reconfigurable fabric (RF) with a
superscalar core. Our compiler automatically partitions and
compiles an application into an instruction stream for the core
and a configuration for the RF. We use a detailed simulator to
capture both timing and energy numbers for all parts of the
system. \par Our results indicate that a hierarchical RF
architecture, designed around a scalable interconnect, is
instrumental in harnessing the benefits of spatial computation.
The interconnect uses static configuration and routing at the
lower levels and a packet-switched, dynamically-routed network at
the top level. Tartan is most energy-efficient when almost all of
the application is mapped to the RF, indicating the need for the
RF to support most general-purpose programming constructs. Our
initial investigation reveals that such a system can provide, on
average, an order of magnitude improvement in energy-delay
compared to an aggressive superscalar core on single-threaded
workloads.},
keywords = {Asychronous Circuits, Spatial Computing, Reconfigurable
Computing,Phoenix, Tartan},
url = {http://www.cs.cmu.edu/~seth/papers/mahim-asplos06.pdf},
}
|
|
Dataflow: A Complement to Superscalar | pdf bib | |
Mihai Budiu, Pedro V. Artigas, and Seth Copen Goldstein.
In IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS),
pages 177–186, Mar 1990.
|
| @inproceedings{budiu-ispass05,
author = {Budiu, Mihai and Artigas, Pedro V. and Goldstein, Seth
Copen},
title = {Dataflow: A Complement to Superscalar},
booktitle = {IEEE International Symposium on Performance Analysis of
Systems and Software (ISPASS)},
month = {Mar},
year = {2005},
pages = {177--186},
address = {Austin, TX},
url = {http://www.cs.cmu.edu/~seth/papers/budiu-ispass05.pdf},
abstract = {There has been a resurgence of interest in dataflow
architectures, because of their potential for exploiting
parallelism with low overhead. In this paper we analyze the
performance of a class of static dataflow machines on integer
media and control-intensive programs and we explain why a
dataflow machine, even with unlimited resources, does not always
outperform a superscalar processor on general-purpose codes,
under the assumption that both machines take the same time to
execute basic operations. We compare a program-specific dataflow
machine with unlimited parallelism to a superscalar processor
running the same program. While the dataflow machines provide
very good performance on most data-parallel programs, we show
that the dataflow machine cannot always take advantage of the
available parallelism. Using the dynamic critical path we
investigate the mechanisms used by superscalar processors to
provide a performance advantage and their impact on a dataflow
model.},
confweb = {http://www.ispass.org/ispass2005},
keywords = {Spatial Computing,Phoenix},
}
|
|
Inter-iteration Scalar Replacement in the Presence of Conditional Control Flow | pdf bib | |
Mihai Budiu and Seth Copen Goldstein.
In 3rd Workshop on Optimizations for DSO and Embedded Systems,
Mar 1990.
Also appeared as CMU CS Technical Report, CMU-CS-04-103.
|
| @inproceedings{budiu-odes05,
title = {Inter-iteration Scalar Replacement in the Presence of
Conditional Control Flow},
url = {http://www.cs.cmu.edu/~seth/papers/budiu-odes05.pdf},
booktitle = {3rd Workshop on Optimizations for DSO and Embedded
Systems},
author = {Budiu, Mihai and Goldstein, Seth Copen},
year = {2005},
address = {San Jose, CA},
month = {Mar},
also = {CMU CS Technical Report, CMU-CS-04-103},
keywords = {Phoenix,Compilers:Loop Optimizations,Compilers:Scalar
Replacement},
}
|
|
SOMA: A Tool for Synthesizing and Optimizing Memory Accesses in ASICs | pdf bib | |
Girish Venkataramani, Tobias Bjerregaard, Tiberiu Chelcea, and Seth Copen Goldstein.
In IEEE/ACM/IFIP International Conference on Hardware/Software Codesign and System Synthesis (CODES-ISSS),
pages 231–236, Sep 1990.
|
| @inproceedings{venkataramani-isss05,
title = {SOMA: A Tool for Synthesizing and Optimizing Memory
Accesses in ASICs},
author = {Venkataramani, Girish and Bjerregaard, Tobias and Chelcea,
Tiberiu and Goldstein, Seth Copen},
booktitle = {IEEE/ACM/IFIP International Conference on
Hardware/Software Codesign and System Synthesis (CODES-ISSS)},
year = {2005},
isbn = {1-59593-161-9},
pages = {231-236},
address = {Jersey City, NJ, USA},
month = {Sep},
abstract = {Arbitrary memory dependencies and variable latency
memory systems are major obstacles to the synthesis of
large-scale ASIC systems in high-level synthesis. This paper
presents SOMA, a synthesis framework for constructing Memory
Access Network (MAN) architectures that inherently enforce memory
consistency in the presence of dynamic memory access
dependencies. A fundamental bottleneck in any such network is
arbitrating between concurrent accesses to a shared memory
resource. To alleviate this bottleneck, SOMA uses an
application-specific concurrency analysis technique to predict
the dynamic memory parallelism profile of the application. This
is then used to customize the MAN architecture. Depending on the
parallelism profile, the MAN may be optimized for latency,
throughput or both. The optimized MAN is automatically
synthesized into gate-level structural Verilog using a flexible
library of network building blocks. SOMA has been successfully
integrated into an automated C-to-hardware synthesis flow, which
generates standard cell circuits from unrestricted ANSI-C
programs. Post-layout experiments demonstrate that application
specific MAN construction significantly improves power and
performance.},
keywords = {Asychronous Circuits, Spatial Computing,Phoenix,
CAD,Compilers:Memory Optimizations},
url = {http://www.cs.cmu.edu/~seth/papers/venkataramani-isss05.pdf},
}
|
|
HLS Support for Unconstrained Memory Accesses | pdf bib | |
Girish Venkataramani, Tiberiu Chelcea, and Seth Copen Goldstein.
In IEEE 14th International Workshop on Logic Synthesis (IWLS),
Jun 1990.
|
| @inproceedings{venkataramani-iwls05,
title = {{HLS} Support for Unconstrained Memory Accesses},
author = {Venkataramani, Girish and Chelcea, Tiberiu and Goldstein,
Seth Copen},
booktitle = {IEEE 14th International Workshop on Logic Synthesis
(IWLS)},
year = {2005},
address = {Lake Arrowhead, CA},
month = {Jun},
abstract = {A major obstacle in high-level synthesis (HLS) of
large-scale ASIC systems is memory access patterns. Typically,
most state-of-the-art HLS tools impose constraints on the memory
references in the source application, requiring them to exhibit
predictable access patterns, and/or requiring dependencies
between them to be statically determinable. This paper addresses
the HLS problem when such constraints are relaxed. We present an
analysis infrastructure that can be used within any HLS toolflow
for synthesizing circuits from high-level abstractions, such as
ANSI-C, where no assumptions can be made about memory access
latencies, and where dependencies between memory references can
only be disambiguated dynamically at runtime (pointer aliasing).
We start by describing a generic framework to build a
dependence-aware, fully distributed, although often conservative,
memory-access network (MAN) for a given memory-dependence graph.
Then, we propose a suite of optimizations to customize the MAN
for the given specification. All these techniques guarantee
memory coherency. Experimental results on Mediabench benchmarks,
show that such an approach succeeds in maintaining high levels of
parallelism, while ensuring memory coherency. The optimizations
succeed in lowering the synchronization overhead by as much as
4x.},
keywords = {Asychronous Circuits, Spatial Computing,Phoenix},
url = {http://www.cs.cmu.edu/~seth/papers/venkataramani-iwls05.pdf},
}
|
|
Defect Tolerance at the End of the Roadmap | bib | |
Mahim Mishra and Seth Copen Goldstein.
In Nano, Quantum and Molecular Computing: Implications to High Level Design and Validation,
1990.
|
| @incollection{mishra-nqmc04,
title = {Defect Tolerance at the End of the Roadmap},
booktitle = {Nano, Quantum and Molecular Computing: Implications to
High Level Design and Validation},
author = {Mishra, Mahim and Goldstein, Seth Copen},
year = {2004},
editor = {Sandeep K. Shukla and R. Iris Bahar},
publisher = {Kluwer Academic Publishers},
isbn = {1-4020-80670},
keywords = {Electronic Nanotechnology,Fault and Defect
Tolerance,Reconfigurable Computing,Phoenix,molecular
electronics},
}
|
|
Inter-Iteration Scalar Replacement in the Presence of Conditional Control-Flow | pdf bib | |
Mihai Budiu and Seth Copen Goldstein.
Carnegie Mellon University Technical Report,
Feb 1990.
See budiu-odes05.
|
| @techreport{budiu-tr04,
title = {Inter-Iteration Scalar Replacement in the Presence of
Conditional Control-Flow},
url = {http://www.cs.cmu.edu/~seth/papers/budiu-tr04.pdf},
booktitle = {CMU CS Technical Report, CMU-CS-04-103},
month = {Feb},
year = {2004},
author = {Budiu, Mihai and Goldstein, Seth Copen},
institution = {Carnegie Mellon University},
see = {budiu-odes05},
keywords = {Phoenix,Compilers:Loop Optimizations,Compilers:Scalar
Replacement},
}
|
|
Programmer Specified Pointer Independence | pdf bib | |
David Ryan Koes, Mihai Budiu, Girish Venkataramani, and Seth Copen Goldstein.
In Proceedings of the 2004 workshop on Memory system performance (MSP),
pages 51–59, Jun 1990.
Also appeared as Carnegie Mellon University TR CMU-CS-03-123.
|
| @inproceedings{koes-msp2004,
author = {Koes, David Ryan and Budiu, Mihai and Venkataramani,
Girish and Goldstein, Seth Copen},
title = {Programmer Specified Pointer Independence},
booktitle = {Proceedings of the 2004 workshop on Memory system
performance (MSP)},
month = {Jun},
year = {2004},
isbn = {1-58113-941-1},
pages = {51--59},
address = {Washington, D.C.},
doi = {http://doi.acm.org/10.1145/1065895.1065905},
also = {Carnegie Mellon University TR CMU-CS-03-123},
url = {http://www.cs.cmu.edu/~seth/papers/koes-msp2004.pdf},
confweb = {http://cs.anu.edu.au/~Steve.Blackburn/msp2004},
publisher = {ACM Press},
abstract = {Good alias analysis is essential in order to achieve
high performance on modern processors, yet precise
interprocedural analysis does not scale well. We present a source
code annotation, {\tt \#pragma independent}, which provides
precise pointer aliasing information to the compiler, and
describe a tool which highlights the most important and most
likely correct locations at which a programmer should insert
these annotations. Using this tool we perform a limit study on
the effectiveness of pointer independence in improving program
performance through improved compilation.},
keywords = {Compilers:Alias Analysis,Phoenix},
}
|
|
Spatial Computation | pdf bib | |
Mihai Budiu, Girish Venkataramani, Tiberiu Chelcea, and Seth Copen Goldstein.
In International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS),
pages 14–26, Oct 1990.
|
| @inproceedings{budiu-asplos04,
author = {Budiu, Mihai and Venkataramani, Girish and Chelcea,
Tiberiu and Goldstein, Seth Copen},
title = {Spatial Computation},
booktitle = {International Conference on Architectural Support for
Programming Languages and Operating Systems (ASPLOS)},
pages = {14--26},
month = {Oct},
address = {Boston, MA},
year = {2004},
url = {http://www.cs.cmu.edu/~seth/papers/budiu-asplos04.pdf},
abstract = {This paper describes a computer architecture that relies
on the direct translation of high-level language programs into
{\em Spatial Computation} (SC) hardware structures. SC program
implementations are completely distributed, without any
centralized control. SC circuits are optimized for {\em wires} at
the expense of computation units. \par In this paper we
investigate a particular implementation SC structures called ASH
(Application-Specific Hardware). Under the assumption that
computation is cheaper than communication, ASH replicates
computation units to simplify interconnect, building a system
which uses very simple, completely dedicated communication
channels. As a consequence, communication on the datapath never
requires arbitration; the only arbitration required is for
accessing memory. ASH relies on very simple hardware primitives,
using no associative structures, no multiported register files,
no scheduling logic, no broadcast, and no clocks. As a
consequence, ASH hardware is fast and extremely power efficient.
\par In this work we demonstrate three features of ASH: (1) that
such architectures can be built by automatic compilation of C
programs, (2) that distributed computation is in some respects
fundamentally different from monolithic superscalar processors
and (3) that ASIC implementations of ASH use 3 orders of
magnitude less energy compared to high-end superscalar
processors, while being within a factor of two in performance.},
keywords = {Asychronous Circuits, Spatial Computing,Phoenix},
}
|
|
Translating ANSI C to Asynchronous Circuits | pdf bib | |
Mihai Budiu, Girish Venkataramani, Tiberiu Chelcea, and Seth Copen Goldstein.
In 10th IEEE International Symposium on Asynchronous Circuits and Systems (ASYNC '04),
Apr 1990.
|
| @inproceedings{budiu-async04,
title = {Translating ANSI C to Asynchronous Circuits},
url = {http://www.cs.cmu.edu/~seth/papers/budiu-async04.pdf},
booktitle = {10th IEEE International Symposium on Asynchronous
Circuits and Systems (ASYNC '04)},
author = {Budiu, Mihai and Venkataramani, Girish and Chelcea,
Tiberiu and Goldstein, Seth Copen},
address = {Crete, Greece},
year = {2004},
month = {Apr},
keywords = {Asychronous Circuits,CAD,Electronic Nanotechnology,Fault
and Defect Tolerance,Phoenix,Reconfigurable Computing,Spatial
Computing},
}
|
|
C to Asynchronous Dataflow Circuits: An End-to-End Toolflow | pdf bib | |
Girish Venkataramani, Mihai Budiu, Tiberiu Chelcea, and Seth Copen Goldstein.
In IEEE 13th International Workshop on Logic Synthesis (IWLS),
Jun 1990.
|
| @inproceedings{venkataramani-iwls04,
title = {{C} to Asynchronous Dataflow Circuits: An End-to-End
Toolflow},
author = {Venkataramani, Girish and Budiu, Mihai and Chelcea,
Tiberiu and Goldstein, Seth Copen},
booktitle = {IEEE 13th International Workshop on Logic Synthesis
(IWLS)},
address = {Temecula, CA},
month = {Jun},
year = {2004},
url = {http://www.cs.cmu.edu/~seth/papers/venkataramani-iwls04.pdf},
abstract = {We present a complete toolflow that translates ANSI-C
programs into asynchronous circuits. The toolflow is built around
a compiler that converts C into a functional dataflow
intermediate representation, exposing instruction-level, pipeline
and memory parallelism. The compiler performs optimizations and
converts the intermediate representation into pipelined
asynchronous circuits, with no centralized controllers. In the
resulting circuits, control is distributed, communication is
achieved through local wires, and arbitration for datapath
resources is unnecessary. Circuits automatically synthesized from
Mediabench kernels exhibit substantially better energy-delay than
either single-issue processors or aggressive superscalar cores.},
keywords = {Asychronous Circuits,Spatial Computing,Phoenix,CAD},
}
|
|
Defect Tolerance After the Roadmap | pdf bib | |
Mahim Mishra and Seth Copen Goldstein.
In Proceedings of the 10th International Test Synthesis Workshop (ITSW),
Mar 1990.
|
| @inproceedings{mishra-itsw03,
author = {Mishra, Mahim and Goldstein, Seth Copen},
title = {Defect Tolerance After the Roadmap},
booktitle = {Proceedings of the 10th International Test Synthesis
Workshop (ITSW)},
month = {Mar},
year = {2003},
address = {Santa Barbara, {CA}},
keywords = {Spatial Computing, Reconfigurable Computing,Phoenix,
Fault and Defect Tolerance},
url = {http://www.cs.cmu.edu/~seth/papers/mishra-itsw03.pdf},
}
|
|
Defect Tolerance at the End of the Roadmap | pdf bib | |
Mahim Mishra and Seth Copen Goldstein.
In Proceedings of the International Test Conference (ITC), 2003,
Sep 1990.
|
| @inproceedings{mishra-itc03,
author = {Mishra, Mahim and Goldstein, Seth Copen},
title = {Defect Tolerance at the End of the Roadmap},
booktitle = {Proceedings of the International Test Conference
({ITC}), 2003},
month = {Sep},
year = {2003},
address = {Charlotte, {NC}},
url = {http://www.cs.cmu.edu/~seth/papers/mishra-itc03.pdf},
abstract = {Defect tolerance will become more important as feature
sizes shrink closer to single digit nanometer dimensions. This is
true whether the chips are manufactured using top-down methods
(e.g., photolithography) or bottom-up methods (e.g., chemically
assembled electronic nanotechnology, or CAEN). In this paper, we
propose a defect tolerance methodology centered around
reconfigurable devices, a scalable testing method, and dynamic
place-and-route. Our methodology is particularly well suited for
CAEN.},
keywords = {Spatial Computing, Reconfigurable
Computing,Phoenix,Fault and Defect Tolerance},
}
|
|
Optimizing Memory Accesses For Spatial Computation | pdf bib | |
Mihai Budiu and Seth Copen Goldstein.
In Proceedings of the 1st International ACM/IEEE Symposium on Code Generation and Optimization (CGO 03),
pages 216–227, Mar 1990.
|
| @inproceedings{budiu-cgo03,
title = {Optimizing Memory Accesses For Spatial Computation},
author = {Budiu, Mihai and Goldstein, Seth Copen},
booktitle = {Proceedings of the 1st International ACM/IEEE Symposium
on Code Generation and Optimization (CGO 03)},
year = {2003},
address = {San Francisco, CA},
month = {Mar},
pages = {216-227},
url = {http://www.cs.cmu.edu/~seth/papers/budiu-cgo03.pdf},
keywords = {Spatial Computing, Reconfigurable
Computing,Phoenix,Compilers:Memory Optimizations},
}
|
|
Compiling Application-Specific Hardware | pdf bib | |
Mihai Budiu and Seth Copen Goldstein.
In Proceedings of the 12th International Conference on Field Programmable Logic and Applications,
pages 853–863, Sep 1990.
|
| @inproceedings{budiu-fpl02,
author = {Budiu, Mihai and Goldstein, Seth Copen},
title = {Compiling Application-Specific Hardware},
booktitle = {Proceedings of the 12th International Conference on
Field Programmable Logic and Applications},
year = {2002},
address = {Montpellier (La Grande-Motte), France},
month = {Sep},
pages = {853--863},
url = {http://www.cs.cmu.edu/~seth/papers/budiu-fpl02.pdf},
abstract = {In this paper we describe ASH, an architectural
framework for implementing Application-Specific Hardware. ASH is
based on automatic hardware synthesis from high-level languages.
The generated circuits use only localized computation structures;
in consequence, we expect these circuits to be fast, to use
little power and to scale well with program complexity. \par We
present in detail CASH, a scalable compiler framework for ASH,
which generates hardware from programs written in C. Our compiler
exploits instruction level parallelism by using aggressive
speculation and dynamic scheduling. Based on this compilation
scheme, we evaluate the computational resources necessary for
implementing complex integer-based programs, and we suggest
architectural features that would support the ASH framework.},
keywords = {Spatial Computing,Phoenix,Compilers:CASH},
}
|
|
Factors Influencing the Performance of a CPU-RFU Hybrid Architecture | pdf bib | |
Girish Venkataramani, Suraj Sudhir, Mihai Budiu, and Seth Copen Goldstein.
In Proceedings of the 12th International Conference on Field Programmable Logic and Applications (FPL),
pages 955–965, Sep 1990.
|
| @inproceedings{venkataramani-fpl02,
title = {Factors Influencing the Performance of a CPU-RFU Hybrid
Architecture},
author = {Venkataramani, Girish and Sudhir, Suraj and Budiu, Mihai
and Goldstein, Seth Copen},
booktitle = {Proceedings of the 12th International Conference on
Field Programmable Logic and Applications (FPL)},
year = {2002},
address = {Montpellier (La Grande-Motte), France},
month = {Sep},
url = {http://www.cs.cmu.edu/~seth/papers/venkataramani-fpl02.pdf},
abstract = {Closely coupling a reconfigurable fabric with a
conventional processor has been shown to successfully improve the
system performance. However, today s superscalar pro-cessors are
both complex and adept at extracting Instruction Level
Parallelism (ILP), which introduces many complex issues to the
design of a hybrid CPU-RFU system. This paper examines the design
of a superscalar processor augmented with a closely-coupled
recon-figurable fabric. It identifies architectural and compiler
issues that affect the performance of the overall system.
Previous efforts at combining a processor core with a
reconfigurable fabric are examined in the light of these issues.
We also present simulation results that emphasize the impact of
these factors.},
pages = {955-965},
isbn = {3-540-44108-5},
publisher = {Springer-Verlag},
keywords = {Spatial Computing,Reconfigurable Computing,Phoenix},
}
|
|
Pegasus: An Efficient Intermediate Representation | pdf bib | |
Mihai Budiu and Seth Copen Goldstein.
Carnegie Mellon University Technical Report No. CMU-CS-02-107,
pages 20, May 1990.
|
| @techreport{budiu-tr02,
author = {Budiu, Mihai and Goldstein, Seth Copen},
title = {Pegasus: An Efficient Intermediate Representation},
institution = {Carnegie Mellon University},
year = {2002},
number = {CMU-CS-02-107},
month = {May},
url = {http://www.cs.cmu.edu/~seth/papers/budiu-tr02.pdf},
pages = {20},
abstract = {We present Pegasus, a compact and expressive
intermediate representation for imperative languages. The
representation is suitable for target architectures supporting
predicated execution and aggressive speculation. In Pegasus
information about the global dataflow of the program is encoded
in local structures, enabling compact and efficient algorithms
for program optimizations. As a proof of the versatility of
Pegasus, we have used it in a compiler translating C programs to
hardware implementations.},
keywords = {Spatial Computing, Reconfigurable Computing,Phoenix},
}
|
|
Scalable Defect Tolerance for Molecular Electronics | pdf bib | |
Mahim Mishra and Seth Copen Goldstein.
In Proceedings of the 1st Workshop on Non-Silicon Computing (NSC-1),
1990.
|
| @inproceedings{mishra_goldstein_nsc1,
author = {Mishra, Mahim and Goldstein, Seth Copen},
title = {Scalable Defect Tolerance for Molecular Electronics},
booktitle = {Proceedings of the 1st Workshop on Non-Silicon
Computing (NSC-1)},
address = {{Cambridge, MA}},
year = {2002},
url = {http://www.cs.cmu.edu/~seth/papers/mishra_goldstein_nsc1.pdf},
abstract = {Chemically assembled electronic nanotechnology (CAEN) is
a promising alternative to CMOS-based computing. However,
CAEN-based circuits are expected to have huge defect densities.
To solve this problem CAEN can be used to build reconfigurable
fabrics which, assuming the defects can be found, are inherently
defect tolerant. In this paper, we propose a scalable testing
methodology for finding defects in reconfigurable devices.},
keywords = {Reconfigurable Computing, Phoenix,Fault and Defect
Tolerance},
}
|
|
NanoFabrics: Spatial Computing Using Molecular Electronics | pdf bib | |
Seth Copen Goldstein and Mihai Budiu.
In Proceedings of the 28th International Symposium on Computer Architecture (ISCA),
pages 178–189, Jul 1990.
|
| @inproceedings{goldstein-isca01,
author = {Goldstein, Seth Copen and Budiu, Mihai},
title = {{NanoFabrics}: Spatial Computing Using Molecular
Electronics},
booktitle = {Proceedings of the 28th International Symposium on
Computer Architecture (ISCA)},
month = {Jul},
address = {{G\"{o}teborg, Sweden}},
year = {2001},
pages = {178--189},
abstract = {The continuation of the remarkable exponential increases
in processing power over the recent past faces imminent
challenges due in part to the physics of deep-submicron CMOS
devices and the costs of both chip masks and future fabrication
plants. A promising solution to these problems is offered by an
alternative to CMOS-based computing, chemically assembled
electronic nanotechnology (CAEN). In this paper we outline how
CAEN based computing can become a reality. We briefly describe
recent work in CAEN and how CAEN will affect computer
architecture. We show how the inherently reconfigurable natures
of CAEN devices can be exploited to provide high-density chips
with defect tolerance which will significantly reduce the cost of
manufacturing. After developing the basic building blocks of a
CAEN based computing devices we present some preliminary results
which indicate that CAEN based computing devices can meet or
exceed the performance of CMOS based devices.},
url = {http://www.cs.cmu.edu/~seth/papers/goldstein-isca01.pdf},
keywords = {Spatial Computing, Reconfigurable Computing,Phoenix,
Electronic Nanotechnology},
}
|
|
BitValue Inference: Detecting and Exploiting Narrow Bitwidth Computations | pdf bib | |
Mihai Budiu, Majd Sakr, Kevin Walker, and Seth Copen Goldstein.
In Proceedings of the 2000 Europar Conference,
volume 1900, pages 969–979, Aug 1990.
Also appeared as CMU CS Technical Report, CMU-CS-00-141, October 2000..
|
| @inproceedings{budiu-europar00,
title = {{BitValue} Inference: Detecting and Exploiting Narrow
Bitwidth Computations},
author = {Budiu, Mihai and Sakr, Majd and Walker, Kevin and
Goldstein, Seth Copen},
booktitle = {Proceedings of the 2000 Europar Conference},
year = {2000},
volume = {1900},
pages = {969--979},
month = {Aug},
issn = {0302-9743},
series = {Lecture Notes in Computer Science},
publisher = {Springer Verlag},
address = {Munich, Germany},
url = {http://www.cs.cmu.edu/~seth/papers/budiu-europar00.pdf},
also = {CMU CS Technical Report, CMU-CS-00-141, October 2000.},
abstract = {We present a compiler algorithm called BitValue, which
can discover both unused and constant bits in dusty-deck C
programs. BitValue uses forward and backward dataflow analyses,
generalizing constant-folding and dead-code detection at the
bit-level. This algorithm enables compiler optimizations which
target special processor architectures for computing on
non-standard bitwidths. Using this algorithm we show that up to
31\% of the computed bytes are thrown away (for programs from
SpecINT95 and Mediabench). A compiler for reconfigurable hardware
uses this algorithm to achieve substantial reductions (up to
20-fold) in the size of the synthesized circuits.},
keywords = {Spatial Computing,Reconfigurable
Computing,Phoenix,PipeRench,CAD},
}
|
Network-on-a-chip |
|
Hardware Compilation of Application-Specific Memory Access Interconnect | pdf bib | |
Girish Venkataramani, Tobias Bjerregaard, Tiberiu Chelcea, and Seth Copen Goldstein.
IEEE Transactions on Computer Aided Design of Integrated Circuits and Systems,
25(5):756–771, 1990.
|
| @article{venkataramani-tcad06,
title = {Hardware Compilation of Application-Specific Memory Access
Interconnect},
author = {Venkataramani, Girish and Bjerregaard, Tobias and Chelcea,
Tiberiu and Goldstein, Seth Copen},
journal = {IEEE Transactions on Computer Aided Design of Integrated
Circuits and Systems},
year = {2006},
volume = {25},
number = {5},
pages = {756--771},
issn = {0278-0070},
abstract = {{A major obstacle to successful high-level synthesis
(HLS) of large-scale application-specified integrated circuit
systems is the presence of memory accesses to a shared-memory
subsystem. The latency to access memory is often not statically
predictable, which creates problems for scheduling operations
dependent on memory reads. More fundamental is that dependences
between accesses may not be statically provable (e.g., if the
specification language permits pointers), which introduces
memory-consistency problems. Addressing these issues with static
scheduling results in overly conservative circuits, and thus,
most state-of-the-art HLS tools limit memory systems to those
that have predictable latencies and limit programmers to
specifications that forbid arbitrary memory-reference patterns. A
new HLS framework for the synthesis and optimization of memory
accesses (SOMA) is presented. SOMA enables specifications to
include arbitrary memory references (e.g., pointers) and allows
the memory system to incorporate features that might cause the
latency of a memory access to vary dynamically. This results in
raising the level of abstraction in the input specification,
enabling faster design times. SOMA synthesizes a memory access
network (MAN) architecture that facilitates dynamic scheduling
and ordering of memory accesses. The paper describes a basic MAN
construction technique that illustrates how dynamic ordering
helps in efficiently maintaining memory consistency and how
dynamic scheduling helps alleviate the variable-latency problem.
Then, it is shown how static analysis of the access patterns can
be used to optimize the MAN. One optimization changes the MAN
interconnect topology to increase concurrence. A second
optimization reduces the synchronization overhead necessary to
maintain memory consistency. Postlayout experiments demonstrate
that SOMA's application-specific MAN construction significantly
improves power and performance for a range of benchmarks.}},
keywords = {Asychronous Circuits, Spatial
Computing,Phoenix,Network-on-a-chip},
url = {http://www.cs.cmu.edu/~seth/papers/venkataramani-tcad06.pdf},
}
|
Spatial Computing |
|
Hardware Compilation of Application-Specific Memory Access Interconnect | pdf bib | |
Girish Venkataramani, Tobias Bjerregaard, Tiberiu Chelcea, and Seth Copen Goldstein.
IEEE Transactions on Computer Aided Design of Integrated Circuits and Systems,
25(5):756–771, 1990.
|
| @article{venkataramani-tcad06,
title = {Hardware Compilation of Application-Specific Memory Access
Interconnect},
author = {Venkataramani, Girish and Bjerregaard, Tobias and Chelcea,
Tiberiu and Goldstein, Seth Copen},
journal = {IEEE Transactions on Computer Aided Design of Integrated
Circuits and Systems},
year = {2006},
volume = {25},
number = {5},
pages = {756--771},
issn = {0278-0070},
abstract = {{A major obstacle to successful high-level synthesis
(HLS) of large-scale application-specified integrated circuit
systems is the presence of memory accesses to a shared-memory
subsystem. The latency to access memory is often not statically
predictable, which creates problems for scheduling operations
dependent on memory reads. More fundamental is that dependences
between accesses may not be statically provable (e.g., if the
specification language permits pointers), which introduces
memory-consistency problems. Addressing these issues with static
scheduling results in overly conservative circuits, and thus,
most state-of-the-art HLS tools limit memory systems to those
that have predictable latencies and limit programmers to
specifications that forbid arbitrary memory-reference patterns. A
new HLS framework for the synthesis and optimization of memory
accesses (SOMA) is presented. SOMA enables specifications to
include arbitrary memory references (e.g., pointers) and allows
the memory system to incorporate features that might cause the
latency of a memory access to vary dynamically. This results in
raising the level of abstraction in the input specification,
enabling faster design times. SOMA synthesizes a memory access
network (MAN) architecture that facilitates dynamic scheduling
and ordering of memory accesses. The paper describes a basic MAN
construction technique that illustrates how dynamic ordering
helps in efficiently maintaining memory consistency and how
dynamic scheduling helps alleviate the variable-latency problem.
Then, it is shown how static analysis of the access patterns can
be used to optimize the MAN. One optimization changes the MAN
interconnect topology to increase concurrence. A second
optimization reduces the synchronization overhead necessary to
maintain memory consistency. Postlayout experiments demonstrate
that SOMA's application-specific MAN construction significantly
improves power and performance for a range of benchmarks.}},
keywords = {Asychronous Circuits, Spatial
Computing,Phoenix,Network-on-a-chip},
url = {http://www.cs.cmu.edu/~seth/papers/venkataramani-tcad06.pdf},
}
|
|
Leveraging Protocol Knowledge in Slack Matching | pdf bib | |
Girish Venkataramani and Seth Copen Goldstein.
In IEEE/ACM International Conference on Computer-Aided Design (ICCAD),
Nov 1990.
|
| @inproceedings{venkataramani-iccad06,
title = {Leveraging Protocol Knowledge in Slack Matching},
author = {Venkataramani, Girish and Goldstein, Seth Copen},
booktitle = {IEEE/ACM International Conference on Computer-Aided
Design (ICCAD)},
year = {2006},
address = {San Jose, CA},
month = {Nov},
abstract = {{Stalls, due to mis-matches in communication rates, are
a major performance obstacle in pipelined circuits. If the rate
of data production is faster than the rate of consumption, the
resulting design performs slower than when the communication rate
is matched. This can be remedied by inserting pipeline buffers
(to temporarily hold data), allowing the producer to proceed if
the consumer is not ready to accept data. The problem of deciding
which channels need these buffers (and how many) for an arbitrary
communication profile is called the slack matching problem; the
optimal solution to this problem has been shown to be
NP-complete. \par In this paper, we present a heuristic that uses
knowledge of the communication protocol to explicitly model these
bottlenecks, and an iterative algorithm to progressively remove
these bottlenecks by inserting buffers. We apply this algorithm
to asynchronous circuits, and show that it naturally handles
large designs with arbitrarily cyclic and acyclic topologies,
which exhibit various types of control choice. The heuristic is
efficient, achieving linear time complexity in practice, and
produces solutions that (a) achieve up to 60\% performance
speedup on large media processing kernels, and (b) can either be
verified to be optimal, or the approximation margin can be
bounded. }},
keywords = {Asychronous Circuits, Spatial Computing, CAD, Global
Critical Path},
url = {http://www.cs.cmu.edu/~seth/papers/venkataramani-iccad06.pdf},
}
|
|
Modeling the Global Critical Path in Concurrent Systems | pdf bib | |
Girish Venkataramani, Tiberiu Chelcea, Mihai Budiu, and Seth Copen Goldstein.
Carnegie Mellon University Technical Report No. CMU-CS-06-144,
Aug 1990.
|
| @techreport{venkataramani-tr06,
author = {Venkataramani, Girish and Chelcea, Tiberiu and Budiu,
Mihai and Goldstein, Seth Copen},
title = {Modeling the Global Critical Path in Concurrent Systems},
institution = {Carnegie Mellon University},
year = {2006},
number = {CMU-CS-06-144},
month = {Aug},
abstract = {We show how the global critical path can be used as a
practical tool for understanding, optimizing and summarizing the
behavior of highly concurrent self-timed circuits. Traditionally,
critical path analysis has been applied to DAGs, and thus was
constrained to combinatorial sub-circuits. We formally define the
global critical path (GCP) and show how it can be constructed
using only local information that is automatically derived
directly from the circuit. We introduce a form of Production
Rules, which can accurately determine the GCP for a given input
vector, even for modules which exhibit choice and early
termination. \par The GCP provides valuable insight into the
control behavior of the application, which help in formulating
new optimizations and re-formulating existing ones to use the GCP
knowledge. We have constructed a fully automated framework for
GCP detection and analysis, and have incorporated this framework
into a high-level synthesis tool-chain. We demonstrate the
effectiveness of the GCP framework by re-formulating two
traditional CAD optimizations to use the GCP, yielding efficient
algorithms which improve circuit power (by up to 9\%) and
performance (by up to 60\%) in our experiments.},
keywords = {Asychronous Circuits, Spatial Computing,CAD, Global
Critical Path},
url = {http://www.cs.cmu.edu/~seth/papers/venkataramani-tr06.pdf},
}
|
|
Tartan: Evaluating Spatial Computation for Whole Program Execution | pdf bib | |
Mahim Mishra, Timothy J Callahan, Tiberiu Chelcea, Girish Venkataramani, Mihai Budiu, and Seth Copen Goldstein.
In 12th ACM International Conference on Architecture Support for Programming Languages and Operating Systems (ASPLOS),
pages 163–174, Oct 1990.
|
| @inproceedings{mahim-asplos06,
title = {Tartan: Evaluating Spatial Computation for Whole Program
Execution},
author = {Mishra, Mahim and Callahan, Timothy J and Chelcea, Tiberiu
and Venkataramani, Girish and Budiu, Mihai and Goldstein, Seth
Copen},
booktitle = {12th ACM International Conference on Architecture
Support for Programming Languages and Operating Systems
(ASPLOS)},
year = {2006},
pages = {163--174},
address = {San Jose, CA},
month = {Oct},
abstract = {Spatial Computing (SC) has been shown to be an
energy-efficient model for implementing program kernels. In this
paper we explore the feasibility of using SC for more than small
kernels. To this end, we evaluate the performance and energy
efficiency of entire applications on Tartan, a general-purpose
architecture which integrates a reconfigurable fabric (RF) with a
superscalar core. Our compiler automatically partitions and
compiles an application into an instruction stream for the core
and a configuration for the RF. We use a detailed simulator to
capture both timing and energy numbers for all parts of the
system. \par Our results indicate that a hierarchical RF
architecture, designed around a scalable interconnect, is
instrumental in harnessing the benefits of spatial computation.
The interconnect uses static configuration and routing at the
lower levels and a packet-switched, dynamically-routed network at
the top level. Tartan is most energy-efficient when almost all of
the application is mapped to the RF, indicating the need for the
RF to support most general-purpose programming constructs. Our
initial investigation reveals that such a system can provide, on
average, an order of magnitude improvement in energy-delay
compared to an aggressive superscalar core on single-threaded
workloads.},
keywords = {Asychronous Circuits, Spatial Computing, Reconfigurable
Computing,Phoenix, Tartan},
url = {http://www.cs.cmu.edu/~seth/papers/mahim-asplos06.pdf},
}
|
|
Dataflow: A Complement to Superscalar | pdf bib | |
Mihai Budiu, Pedro V. Artigas, and Seth Copen Goldstein.
In IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS),
pages 177–186, Mar 1990.
|
| @inproceedings{budiu-ispass05,
author = {Budiu, Mihai and Artigas, Pedro V. and Goldstein, Seth
Copen},
title = {Dataflow: A Complement to Superscalar},
booktitle = {IEEE International Symposium on Performance Analysis of
Systems and Software (ISPASS)},
month = {Mar},
year = {2005},
pages = {177--186},
address = {Austin, TX},
url = {http://www.cs.cmu.edu/~seth/papers/budiu-ispass05.pdf},
abstract = {There has been a resurgence of interest in dataflow
architectures, because of their potential for exploiting
parallelism with low overhead. In this paper we analyze the
performance of a class of static dataflow machines on integer
media and control-intensive programs and we explain why a
dataflow machine, even with unlimited resources, does not always
outperform a superscalar processor on general-purpose codes,
under the assumption that both machines take the same time to
execute basic operations. We compare a program-specific dataflow
machine with unlimited parallelism to a superscalar processor
running the same program. While the dataflow machines provide
very good performance on most data-parallel programs, we show
that the dataflow machine cannot always take advantage of the
available parallelism. Using the dynamic critical path we
investigate the mechanisms used by superscalar processors to
provide a performance advantage and their impact on a dataflow
model.},
confweb = {http://www.ispass.org/ispass2005},
keywords = {Spatial Computing,Phoenix},
}
|
|
SOMA: A Tool for Synthesizing and Optimizing Memory Accesses in ASICs | pdf bib | |
Girish Venkataramani, Tobias Bjerregaard, Tiberiu Chelcea, and Seth Copen Goldstein.
In IEEE/ACM/IFIP International Conference on Hardware/Software Codesign and System Synthesis (CODES-ISSS),
pages 231–236, Sep 1990.
|
| @inproceedings{venkataramani-isss05,
title = {SOMA: A Tool for Synthesizing and Optimizing Memory
Accesses in ASICs},
author = {Venkataramani, Girish and Bjerregaard, Tobias and Chelcea,
Tiberiu and Goldstein, Seth Copen},
booktitle = {IEEE/ACM/IFIP International Conference on
Hardware/Software Codesign and System Synthesis (CODES-ISSS)},
year = {2005},
isbn = {1-59593-161-9},
pages = {231-236},
address = {Jersey City, NJ, USA},
month = {Sep},
abstract = {Arbitrary memory dependencies and variable latency
memory systems are major obstacles to the synthesis of
large-scale ASIC systems in high-level synthesis. This paper
presents SOMA, a synthesis framework for constructing Memory
Access Network (MAN) architectures that inherently enforce memory
consistency in the presence of dynamic memory access
dependencies. A fundamental bottleneck in any such network is
arbitrating between concurrent accesses to a shared memory
resource. To alleviate this bottleneck, SOMA uses an
application-specific concurrency analysis technique to predict
the dynamic memory parallelism profile of the application. This
is then used to customize the MAN architecture. Depending on the
parallelism profile, the MAN may be optimized for latency,
throughput or both. The optimized MAN is automatically
synthesized into gate-level structural Verilog using a flexible
library of network building blocks. SOMA has been successfully
integrated into an automated C-to-hardware synthesis flow, which
generates standard cell circuits from unrestricted ANSI-C
programs. Post-layout experiments demonstrate that application
specific MAN construction significantly improves power and
performance.},
keywords = {Asychronous Circuits, Spatial Computing,Phoenix,
CAD,Compilers:Memory Optimizations},
url = {http://www.cs.cmu.edu/~seth/papers/venkataramani-isss05.pdf},
}
|
|
HLS Support for Unconstrained Memory Accesses | pdf bib | |
Girish Venkataramani, Tiberiu Chelcea, and Seth Copen Goldstein.
In IEEE 14th International Workshop on Logic Synthesis (IWLS),
Jun 1990.
|
| @inproceedings{venkataramani-iwls05,
title = {{HLS} Support for Unconstrained Memory Accesses},
author = {Venkataramani, Girish and Chelcea, Tiberiu and Goldstein,
Seth Copen},
booktitle = {IEEE 14th International Workshop on Logic Synthesis
(IWLS)},
year = {2005},
address = {Lake Arrowhead, CA},
month = {Jun},
abstract = {A major obstacle in high-level synthesis (HLS) of
large-scale ASIC systems is memory access patterns. Typically,
most state-of-the-art HLS tools impose constraints on the memory
references in the source application, requiring them to exhibit
predictable access patterns, and/or requiring dependencies
between them to be statically determinable. This paper addresses
the HLS problem when such constraints are relaxed. We present an
analysis infrastructure that can be used within any HLS toolflow
for synthesizing circuits from high-level abstractions, such as
ANSI-C, where no assumptions can be made about memory access
latencies, and where dependencies between memory references can
only be disambiguated dynamically at runtime (pointer aliasing).
We start by describing a generic framework to build a
dependence-aware, fully distributed, although often conservative,
memory-access network (MAN) for a given memory-dependence graph.
Then, we propose a suite of optimizations to customize the MAN
for the given specification. All these techniques guarantee
memory coherency. Experimental results on Mediabench benchmarks,
show that such an approach succeeds in maintaining high levels of
parallelism, while ensuring memory coherency. The optimizations
succeed in lowering the synchronization overhead by as much as
4x.},
keywords = {Asychronous Circuits, Spatial Computing,Phoenix},
url = {http://www.cs.cmu.edu/~seth/papers/venkataramani-iwls05.pdf},
}
|
|
Spatial Computation | pdf bib | |
Mihai Budiu, Girish Venkataramani, Tiberiu Chelcea, and Seth Copen Goldstein.
In International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS),
pages 14–26, Oct 1990.
|
| @inproceedings{budiu-asplos04,
author = {Budiu, Mihai and Venkataramani, Girish and Chelcea,
Tiberiu and Goldstein, Seth Copen},
title = {Spatial Computation},
booktitle = {International Conference on Architectural Support for
Programming Languages and Operating Systems (ASPLOS)},
pages = {14--26},
month = {Oct},
address = {Boston, MA},
year = {2004},
url = {http://www.cs.cmu.edu/~seth/papers/budiu-asplos04.pdf},
abstract = {This paper describes a computer architecture that relies
on the direct translation of high-level language programs into
{\em Spatial Computation} (SC) hardware structures. SC program
implementations are completely distributed, without any
centralized control. SC circuits are optimized for {\em wires} at
the expense of computation units. \par In this paper we
investigate a particular implementation SC structures called ASH
(Application-Specific Hardware). Under the assumption that
computation is cheaper than communication, ASH replicates
computation units to simplify interconnect, building a system
which uses very simple, completely dedicated communication
channels. As a consequence, communication on the datapath never
requires arbitration; the only arbitration required is for
accessing memory. ASH relies on very simple hardware primitives,
using no associative structures, no multiported register files,
no scheduling logic, no broadcast, and no clocks. As a
consequence, ASH hardware is fast and extremely power efficient.
\par In this work we demonstrate three features of ASH: (1) that
such architectures can be built by automatic compilation of C
programs, (2) that distributed computation is in some respects
fundamentally different from monolithic superscalar processors
and (3) that ASIC implementations of ASH use 3 orders of
magnitude less energy compared to high-end superscalar
processors, while being within a factor of two in performance.},
keywords = {Asychronous Circuits, Spatial Computing,Phoenix},
}
|
|
Translating ANSI C to Asynchronous Circuits | pdf bib | |
Mihai Budiu, Girish Venkataramani, Tiberiu Chelcea, and Seth Copen Goldstein.
In 10th IEEE International Symposium on Asynchronous Circuits and Systems (ASYNC '04),
Apr 1990.
|
| @inproceedings{budiu-async04,
title = {Translating ANSI C to Asynchronous Circuits},
url = {http://www.cs.cmu.edu/~seth/papers/budiu-async04.pdf},
booktitle = {10th IEEE International Symposium on Asynchronous
Circuits and Systems (ASYNC '04)},
author = {Budiu, Mihai and Venkataramani, Girish and Chelcea,
Tiberiu and Goldstein, Seth Copen},
address = {Crete, Greece},
year = {2004},
month = {Apr},
keywords = {Asychronous Circuits,CAD,Electronic Nanotechnology,Fault
and Defect Tolerance,Phoenix,Reconfigurable Computing,Spatial
Computing},
}
|
|
C to Asynchronous Dataflow Circuits: An End-to-End Toolflow | pdf bib | |
Girish Venkataramani, Mihai Budiu, Tiberiu Chelcea, and Seth Copen Goldstein.
In IEEE 13th International Workshop on Logic Synthesis (IWLS),
Jun 1990.
|
| @inproceedings{venkataramani-iwls04,
title = {{C} to Asynchronous Dataflow Circuits: An End-to-End
Toolflow},
author = {Venkataramani, Girish and Budiu, Mihai and Chelcea,
Tiberiu and Goldstein, Seth Copen},
booktitle = {IEEE 13th International Workshop on Logic Synthesis
(IWLS)},
address = {Temecula, CA},
month = {Jun},
year = {2004},
url = {http://www.cs.cmu.edu/~seth/papers/venkataramani-iwls04.pdf},
abstract = {We present a complete toolflow that translates ANSI-C
programs into asynchronous circuits. The toolflow is built around
a compiler that converts C into a functional dataflow
intermediate representation, exposing instruction-level, pipeline
and memory parallelism. The compiler performs optimizations and
converts the intermediate representation into pipelined
asynchronous circuits, with no centralized controllers. In the
resulting circuits, control is distributed, communication is
achieved through local wires, and arbitration for datapath
resources is unnecessary. Circuits automatically synthesized from
Mediabench kernels exhibit substantially better energy-delay than
either single-issue processors or aggressive superscalar cores.},
keywords = {Asychronous Circuits,Spatial Computing,Phoenix,CAD},
}
|
|
Defect Tolerance After the Roadmap | pdf bib | |
Mahim Mishra and Seth Copen Goldstein.
In Proceedings of the 10th International Test Synthesis Workshop (ITSW),
Mar 1990.
|
| @inproceedings{mishra-itsw03,
author = {Mishra, Mahim and Goldstein, Seth Copen},
title = {Defect Tolerance After the Roadmap},
booktitle = {Proceedings of the 10th International Test Synthesis
Workshop (ITSW)},
month = {Mar},
year = {2003},
address = {Santa Barbara, {CA}},
keywords = {Spatial Computing, Reconfigurable Computing,Phoenix,
Fault and Defect Tolerance},
url = {http://www.cs.cmu.edu/~seth/papers/mishra-itsw03.pdf},
}
|
|
Defect Tolerance at the End of the Roadmap | pdf bib | |
Mahim Mishra and Seth Copen Goldstein.
In Proceedings of the International Test Conference (ITC), 2003,
Sep 1990.
|
| @inproceedings{mishra-itc03,
author = {Mishra, Mahim and Goldstein, Seth Copen},
title = {Defect Tolerance at the End of the Roadmap},
booktitle = {Proceedings of the International Test Conference
({ITC}), 2003},
month = {Sep},
year = {2003},
address = {Charlotte, {NC}},
url = {http://www.cs.cmu.edu/~seth/papers/mishra-itc03.pdf},
abstract = {Defect tolerance will become more important as feature
sizes shrink closer to single digit nanometer dimensions. This is
true whether the chips are manufactured using top-down methods
(e.g., photolithography) or bottom-up methods (e.g., chemically
assembled electronic nanotechnology, or CAEN). In this paper, we
propose a defect tolerance methodology centered around
reconfigurable devices, a scalable testing method, and dynamic
place-and-route. Our methodology is particularly well suited for
CAEN.},
keywords = {Spatial Computing, Reconfigurable
Computing,Phoenix,Fault and Defect Tolerance},
}
|
|
Molecules, Gates, Circuits, Computer | pdf bib | |
Seth Copen Goldstein and Mihai Budiu.
In Molecular Nanoelectronics,
Jan 1990.
|
| @incollection{goldstein-mn03,
title = {Molecules, Gates, Circuits, Computer},
url = {http://www.cs.cmu.edu/~seth/papers/goldstein-mn03.pdf},
booktitle = {Molecular Nanoelectronics},
author = {Goldstein, Seth Copen and Budiu, Mihai},
year = {2003},
editor = {Mark A. Reed and Takhee Lee},
publisher = {American Scientific Publishers},
address = {Stevenson Ranch, CA},
month = {Jan},
isbn = {1-588883-006-3},
keywords = {Asychronous Circuits,CAD,Electronic Nanotechnology,Fault
and Defect Tolerance,Reconfigurable Computing,Spatial
Computing,electronic nanotechnology,molecular electronics},
}
|
|
Optimizing Memory Accesses For Spatial Computation | pdf bib | |
Mihai Budiu and Seth Copen Goldstein.
In Proceedings of the 1st International ACM/IEEE Symposium on Code Generation and Optimization (CGO 03),
pages 216–227, Mar 1990.
|
| @inproceedings{budiu-cgo03,
title = {Optimizing Memory Accesses For Spatial Computation},
author = {Budiu, Mihai and Goldstein, Seth Copen},
booktitle = {Proceedings of the 1st International ACM/IEEE Symposium
on Code Generation and Optimization (CGO 03)},
year = {2003},
address = {San Francisco, CA},
month = {Mar},
pages = {216-227},
url = {http://www.cs.cmu.edu/~seth/papers/budiu-cgo03.pdf},
keywords = {Spatial Computing, Reconfigurable
Computing,Phoenix,Compilers:Memory Optimizations},
}
|
|
Compiling Application-Specific Hardware | pdf bib | |
Mihai Budiu and Seth Copen Goldstein.
In Proceedings of the 12th International Conference on Field Programmable Logic and Applications,
pages 853–863, Sep 1990.
|
| @inproceedings{budiu-fpl02,
author = {Budiu, Mihai and Goldstein, Seth Copen},
title = {Compiling Application-Specific Hardware},
booktitle = {Proceedings of the 12th International Conference on
Field Programmable Logic and Applications},
year = {2002},
address = {Montpellier (La Grande-Motte), France},
month = {Sep},
pages = {853--863},
url = {http://www.cs.cmu.edu/~seth/papers/budiu-fpl02.pdf},
abstract = {In this paper we describe ASH, an architectural
framework for implementing Application-Specific Hardware. ASH is
based on automatic hardware synthesis from high-level languages.
The generated circuits use only localized computation structures;
in consequence, we expect these circuits to be fast, to use
little power and to scale well with program complexity. \par We
present in detail CASH, a scalable compiler framework for ASH,
which generates hardware from programs written in C. Our compiler
exploits instruction level parallelism by using aggressive
speculation and dynamic scheduling. Based on this compilation
scheme, we evaluate the computational resources necessary for
implementing complex integer-based programs, and we suggest
architectural features that would support the ASH framework.},
keywords = {Spatial Computing,Phoenix,Compilers:CASH},
}
|
|
Factors Influencing the Performance of a CPU-RFU Hybrid Architecture | pdf bib | |
Girish Venkataramani, Suraj Sudhir, Mihai Budiu, and Seth Copen Goldstein.
In Proceedings of the 12th International Conference on Field Programmable Logic and Applications (FPL),
pages 955–965, Sep 1990.
|
| @inproceedings{venkataramani-fpl02,
title = {Factors Influencing the Performance of a CPU-RFU Hybrid
Architecture},
author = {Venkataramani, Girish and Sudhir, Suraj and Budiu, Mihai
and Goldstein, Seth Copen},
booktitle = {Proceedings of the 12th International Conference on
Field Programmable Logic and Applications (FPL)},
year = {2002},
address = {Montpellier (La Grande-Motte), France},
month = {Sep},
url = {http://www.cs.cmu.edu/~seth/papers/venkataramani-fpl02.pdf},
abstract = {Closely coupling a reconfigurable fabric with a
conventional processor has been shown to successfully improve the
system performance. However, today s superscalar pro-cessors are
both complex and adept at extracting Instruction Level
Parallelism (ILP), which introduces many complex issues to the
design of a hybrid CPU-RFU system. This paper examines the design
of a superscalar processor augmented with a closely-coupled
recon-figurable fabric. It identifies architectural and compiler
issues that affect the performance of the overall system.
Previous efforts at combining a processor core with a
reconfigurable fabric are examined in the light of these issues.
We also present simulation results that emphasize the impact of
these factors.},
pages = {955-965},
isbn = {3-540-44108-5},
publisher = {Springer-Verlag},
keywords = {Spatial Computing,Reconfigurable Computing,Phoenix},
}
|
|
Pegasus: An Efficient Intermediate Representation | pdf bib | |
Mihai Budiu and Seth Copen Goldstein.
Carnegie Mellon University Technical Report No. CMU-CS-02-107,
pages 20, May 1990.
|
| @techreport{budiu-tr02,
author = {Budiu, Mihai and Goldstein, Seth Copen},
title = {Pegasus: An Efficient Intermediate Representation},
institution = {Carnegie Mellon University},
year = {2002},
number = {CMU-CS-02-107},
month = {May},
url = {http://www.cs.cmu.edu/~seth/papers/budiu-tr02.pdf},
pages = {20},
abstract = {We present Pegasus, a compact and expressive
intermediate representation for imperative languages. The
representation is suitable for target architectures supporting
predicated execution and aggressive speculation. In Pegasus
information about the global dataflow of the program is encoded
in local structures, enabling compact and efficient algorithms
for program optimizations. As a proof of the versatility of
Pegasus, we have used it in a compiler translating C programs to
hardware implementations.},
keywords = {Spatial Computing, Reconfigurable Computing,Phoenix},
}
|
|
NanoFabrics: Spatial Computing Using Molecular Electronics | pdf bib | |
Seth Copen Goldstein and Mihai Budiu.
In Proceedings of the 28th International Symposium on Computer Architecture (ISCA),
pages 178–189, Jul 1990.
|
| @inproceedings{goldstein-isca01,
author = {Goldstein, Seth Copen and Budiu, Mihai},
title = {{NanoFabrics}: Spatial Computing Using Molecular
Electronics},
booktitle = {Proceedings of the 28th International Symposium on
Computer Architecture (ISCA)},
month = {Jul},
address = {{G\"{o}teborg, Sweden}},
year = {2001},
pages = {178--189},
abstract = {The continuation of the remarkable exponential increases
in processing power over the recent past faces imminent
challenges due in part to the physics of deep-submicron CMOS
devices and the costs of both chip masks and future fabrication
plants. A promising solution to these problems is offered by an
alternative to CMOS-based computing, chemically assembled
electronic nanotechnology (CAEN). In this paper we outline how
CAEN based computing can become a reality. We briefly describe
recent work in CAEN and how CAEN will affect computer
architecture. We show how the inherently reconfigurable natures
of CAEN devices can be exploited to provide high-density chips
with defect tolerance which will significantly reduce the cost of
manufacturing. After developing the basic building blocks of a
CAEN based computing devices we present some preliminary results
which indicate that CAEN based computing devices can meet or
exceed the performance of CMOS based devices.},
url = {http://www.cs.cmu.edu/~seth/papers/goldstein-isca01.pdf},
keywords = {Spatial Computing, Reconfigurable Computing,Phoenix,
Electronic Nanotechnology},
}
|
|
BitValue Inference: Detecting and Exploiting Narrow Bitwidth Computations | pdf bib | |
Mihai Budiu, Majd Sakr, Kevin Walker, and Seth Copen Goldstein.
In Proceedings of the 2000 Europar Conference,
volume 1900, pages 969–979, Aug 1990.
Also appeared as CMU CS Technical Report, CMU-CS-00-141, October 2000..
|
| @inproceedings{budiu-europar00,
title = {{BitValue} Inference: Detecting and Exploiting Narrow
Bitwidth Computations},
author = {Budiu, Mihai and Sakr, Majd and Walker, Kevin and
Goldstein, Seth Copen},
booktitle = {Proceedings of the 2000 Europar Conference},
year = {2000},
volume = {1900},
pages = {969--979},
month = {Aug},
issn = {0302-9743},
series = {Lecture Notes in Computer Science},
publisher = {Springer Verlag},
address = {Munich, Germany},
url = {http://www.cs.cmu.edu/~seth/papers/budiu-europar00.pdf},
also = {CMU CS Technical Report, CMU-CS-00-141, October 2000.},
abstract = {We present a compiler algorithm called BitValue, which
can discover both unused and constant bits in dusty-deck C
programs. BitValue uses forward and backward dataflow analyses,
generalizing constant-folding and dead-code detection at the
bit-level. This algorithm enables compiler optimizations which
target special processor architectures for computing on
non-standard bitwidths. Using this algorithm we show that up to
31\% of the computed bytes are thrown away (for programs from
SpecINT95 and Mediabench). A compiler for reconfigurable hardware
uses this algorithm to achieve substantial reductions (up to
20-fold) in the size of the synthesized circuits.},
keywords = {Spatial Computing,Reconfigurable
Computing,Phoenix,PipeRench,CAD},
}
|
Back to publications list
|