Publications

2023

  • Rank-Polymorphism for Shape-Guided Blocking Artjoms Šinkarovs, Thomas Koopman, Sven-Bodo Scholz (2023). In Proceedings of the 11th ACM SIGPLAN International Workshop on Functional High-Performance and Numerical Computing. pp. 1–14. Association for Computing Machinery. New York, NY, USA. DOI BibTeX Abstract
    @inproceedings{ShinKoopSchoFHPNC23,
     author = {\v{S}inkarovs, Artjoms and Koopman, Thomas and Scholz, Sven-Bodo},
     title = {Rank-Polymorphism for Shape-Guided Blocking},
     year = {2023},
     isbn = {9798400702969},
     publisher = {Association for Computing Machinery},
     address = {New York, NY, USA},
     artefact = {https://dl.acm.org/do/10.1145/3580402/},
     doi = {10.1145/3609024.3609410},
     abstract = {Many numerical algorithms on matrices or tensors can be formulated in a blocking style which improves performance due to better cache locality. In imperative languages, blocking is achieved by introducing additional layers of loops in a nested fashion alongside with suitable adjustments in index computations. While this process is tedious and error-prone, it is also difficult to implement a generically blocked version that would support arbitrary levels of blocking. At the example of matrix multiply, this paper demonstrates how rank-polymorphic array languages enable the specification of such generically blocked algorithms in a simple, recursive form. The depth of the blocking as well as blocking factors can be encoded in the structure of array shapes. In turn, reshaping arrays makes it possible to switch between blocked and non-blocked arrays. Through rank-polymorphic array combinators, any specification of loop boundaries or explicit index computations can be avoided. Firstly, we propose a dependently-typed framework for rank-polymorphic arrays. We use it to demonstrate that all blocked algorithms can be naturally derived by induction on the argument shapes. Our framework guarantees lack of out-of-bound indexing, and we also prove that all the blocked versions compute the same results as the canonical algorithm. Secondly, we translate our specification to the array language SaC. Not only do we show that we achieve similar conciseness in the implementation, but we also observe good performance of the generated code. We achieve a 7\% improvement compared to the highly-optimised OpenBLAS library, and 3\% compared to Intel’s MKL library when running on a 32-core shared-memory system.},
     booktitle = {Proceedings of the 11th ACM SIGPLAN International Workshop on Functional High-Performance and Numerical Computing},
     pages = {1--14},
     numpages = {14},
     keywords = {Matrix Multiply, Dependent Types, Array Programming, HPC, Rank-Polymorphism},
     location = {Seattle, WA, USA},
     category = {par,apps},
     series = {FHPNC 2023},
    }
    Many numerical algorithms on matrices or tensors can be formulated in a blocking style which improves performance due to better cache locality. In imperative languages, blocking is achieved by introducing additional layers of loops in a nested fashion alongside with suitable adjustments in index computations. While this process is tedious and error-prone, it is also difficult to implement a generically blocked version that would support arbitrary levels of blocking. At the example of matrix multiply, this paper demonstrates how rank-polymorphic array languages enable the specification of such generically blocked algorithms in a simple, recursive form. The depth of the blocking as well as blocking factors can be encoded in the structure of array shapes. In turn, reshaping arrays makes it possible to switch between blocked and non-blocked arrays. Through rank-polymorphic array combinators, any specification of loop boundaries or explicit index computations can be avoided. Firstly, we propose a dependently-typed framework for rank-polymorphic arrays. We use it to demonstrate that all blocked algorithms can be naturally derived by induction on the argument shapes. Our framework guarantees lack of out-of-bound indexing, and we also prove that all the blocked versions compute the same results as the canonical algorithm. Secondly, we translate our specification to the array language SaC. Not only do we show that we achieve similar conciseness in the implementation, but we also observe good performance of the generated code. We achieve a 7% improvement compared to the highly-optimised OpenBLAS library, and 3% compared to Intel’s MKL library when running on a 32-core shared-memory system.
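    To illustrate the idea (a minimal NumPy sketch under simplifying assumptions, not the paper's SaC or dependently-typed code; all function names are made up): one level of blocking is obtained purely by reshaping and transposing, and the blocked product is a single contraction over the block-level and intra-block "k" axes.

      import numpy as np

      def to_blocked(x, bs):
          # (m, n) -> (m//bs, n//bs, bs, bs): outer axes index blocks,
          # inner axes address elements within a block
          m, n = x.shape
          return x.reshape(m // bs, bs, n // bs, bs).transpose(0, 2, 1, 3)

      def from_blocked(xb):
          mb, nb, bs, _ = xb.shape
          return xb.transpose(0, 2, 1, 3).reshape(mb * bs, nb * bs)

      def blocked_matmul(a, b, bs):
          # contract over the block index k and the intra-block index p
          cb = np.einsum('ikqp,kjpr->ijqr', to_blocked(a, bs), to_blocked(b, bs))
          return from_blocked(cb)

      a, b = np.random.rand(8, 8), np.random.rand(8, 8)
      assert np.allclose(blocked_matmul(a, b, 4), a @ b)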
  • On Generating Out-Of-Core GPU Code for Multi-Dimensional Array Operations Patrick van Beurden, Sven-Bodo Scholz (2023). In Proceedings of the 34th Symposium on Implementation and Application of Functional Languages. pp. ??–??. Association for Computing Machinery. New York, NY, USA. DOI beurdenscholzifl22.pdf BibTeX Abstract
    @inproceedings{BuerSchoIFL22, 
    author = {van Beurden, Patrick and Scholz, Sven-Bodo}, 
    title = {On Generating Out-Of-Core GPU Code for Multi-Dimensional Array Operations}, 
    year = {2023},
    isbn = {978-1-4503-9831-2/22/08}, 
    publisher = {Association for Computing Machinery}, 
    address = {New York, NY, USA}, 
    url = {beurdenscholzifl22.pdf}, 
    doi = {10.1145/3587216.3587223}, 
    abstract = {This paper presents the first results of our experiments for generating CUDA code that streams array operations over the elements of its array arguments from high-level specifications. We look at two classes of memory-bound array operations: map-like operations and iterative stencil computations. We investigate code patterns that stream the arguments of these operations from the host through the GPU and back taking the iterative nature of our experiments into account. We show that this setup does not only enable computations on arrays that are so big that they do not fit into the device memory of a single GPU (hence “out-of-core”), but we also demonstrate that the proposed streaming code outperforms non-streaming code versions even for smaller array sizes. For both application patterns, we observe memory throughputs that are beyond 80% of the hardware capability, irrespective of the problem sizes.},
    summary = {This paper shows the power of streaming for GPUs.},
    category = {par},
    booktitle = {Proceedings of the 34th Symposium on Implementation and Application of Functional Languages}, 
    pages = {??–??}, 
    numpages = {12}, 
    keywords = {code generation, GPUs, CUDA, streaming, memory management, out-of-core computation}, 
    location = {Copenhagen, Denmark}, 
    series = {IFL 2022} 
    }
    This paper presents the first results of our experiments for generating CUDA code that streams array operations over the elements of its array arguments from high-level specifications. We look at two classes of memory-bound array operations: map-like operations and iterative stencil computations. We investigate code patterns that stream the arguments of these operations from the host through the GPU and back taking the iterative nature of our experiments into account. We show that this setup does not only enable computations on arrays that are so big that they do not fit into the device memory of a single GPU (hence “out-of-core”), but we also demonstrate that the proposed streaming code outperforms non-streaming code versions even for smaller array sizes. For both application patterns, we observe memory throughputs that are beyond 80% of the hardware capability, irrespective of the problem sizes.
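    The streaming pattern itself can be sketched independently of CUDA. In the following minimal Python rendering (illustrative only; plain NumPy stands in for the device, and streamed_map and f are made-up names), each chunk is what would travel host-to-device, be computed on, and travel back:

      import numpy as np

      def streamed_map(f, x, chunk=1 << 20):
          # apply f to x one fixed-size chunk at a time, so at most one
          # chunk, not the whole array, needs to be resident on the device
          flat = np.ravel(x)
          out = np.empty_like(flat)
          for start in range(0, flat.size, chunk):
              stop = min(start + chunk, flat.size)
              # in the paper's setting this slice travels host -> GPU,
              # is computed on, and travels back (transfers overlapped)
              out[start:stop] = f(flat[start:stop])
          return out.reshape(x.shape)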

2022

  • Parallel Scan as a Multidimensional Array Problem Artjoms Šinkarovs, Sven-Bodo Scholz (2022). In Proceedings of the 8th ACM SIGPLAN International Workshop on Libraries, Languages and Compilers for Array Programming. pp. 1–11. Association for Computing Machinery. New York, NY, USA. DOI SinkarovsScholzARRAY22.pdf BibTeX Abstract
    @inproceedings{SinkSchoARRAY22,
     author = {\v{S}inkarovs, Artjoms and Scholz, Sven-Bodo},
     title = {Parallel Scan as a Multidimensional Array Problem},
     year = {2022},
     isbn = {9781450392693},
     publisher = {Association for Computing Machinery},
     address = {New York, NY, USA},
     url = {SinkarovsScholzARRAY22.pdf},
     doi = {10.1145/3520306.3534500},
     summary = {The true power of rank-polymorphism at the example of scan.},
     category = {apps, par},
     abstract = {For many algorithms, it is challenging to identify a suitable parallel version, as the design space is typically very large. In this paper we demonstrate how rank-polymorphic array languages can be used as a tool to explore such design spaces through concise high-level specifications. If input data can be organised into a multi-dimensional array, and the algorithm can be stated as a recursive traversal over sub-arrays, array languages offer a lot of expressive power. The reason for this is that array shapes can be used to guide recursive traversals. Conciseness of specifications comes from array reshapes that move the desired elements into canonical hyperplanes. As a case study, we discuss several variants of implementing prefix sums (also known as scans) in SaC. We demonstrate how small code adjustments suffice to change the concurrency pattern exposed to the compiler. It turns out that variability that is typically achieved by generic inductive data types such as binary trees is a special case of what is offered by the array paradigm.},
     booktitle = {Proceedings of the 8th ACM SIGPLAN International Workshop on Libraries, Languages and Compilers for Array Programming},
     pages = {1–11},
     numpages = {11},
     keywords = {rank polymorphism, array languages, algorithms, parallelism, functional programming, prefix sum},
     location = {San Diego, CA, USA}, series = {ARRAY 2022} 
    }
    For many algorithms, it is challenging to identify a suitable parallel version, as the design space is typically very large. In this paper we demonstrate how rank-polymorphic array languages can be used as a tool to explore such design spaces through concise high-level specifications. If input data can be organised into a multi-dimensional array, and the algorithm can be stated as a recursive traversal over sub-arrays, array languages offer a lot of expressive power. The reason for this is that array shapes can be used to guide recursive traversals. Conciseness of specifications comes from array reshapes that move the desired elements into canonical hyperplanes. As a case study, we discuss several variants of implementing prefix sums (also known as scans) in SaC. We demonstrate how small code adjustments suffice to change the concurrency pattern exposed to the compiler. It turns out that variability that is typically achieved by generic inductive data types such as binary trees is a special case of what is offered by the array paradigm.
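    The core trick translates directly into a NumPy sketch (not the paper's SaC code): reshape the vector into a matrix, scan all rows independently, then propagate the scanned row totals:

      import numpy as np

      def blocked_scan(x, rows):
          # inclusive prefix sum via the scan-then-propagate scheme;
          # assumes rows divides x.size (padding would lift that)
          m = x.reshape(rows, -1)
          row_scans = np.cumsum(m, axis=1)       # independent row scans
          offsets = np.concatenate(([0], np.cumsum(row_scans[:-1, -1])))
          return (row_scans + offsets[:, None]).reshape(-1)

      x = np.arange(1, 17)
      assert np.array_equal(blocked_scan(x, 4), np.cumsum(x))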
  • In-Place-Folding of Non-Scalar Hyper-Planes of Multi-Dimensional Arrays Gijs van Cuyck, Sven-Bodo Scholz (2022). In 33rd Symposium on Implementation and Application of Functional Languages. pp. 29–41. Association for Computing Machinery. New York, NY, USA. DOI CuyckScholzIFL21.pdf BibTeX Abstract
    @inproceedings{CuycSchoIFL21,
     author = {van Cuyck, Gijs and Scholz, Sven-Bodo},
     title = {In-Place-Folding of Non-Scalar Hyper-Planes of Multi-Dimensional Arrays},
     year = {2022},
     isbn = {9781450386449},
     publisher = {Association for Computing Machinery},
     address = {New York, NY, USA},
     url = {CuyckScholzIFL21.pdf},
     doi = {10.1145/3544885.3544893},
     summary = {Doing non-scalar folds in place.},
     category = {core, opt},
     abstract = {Memory management plays a key role when trying to compile functional programs into efficiently executable code. In particular when using flat representations for multi-dimensional arrays, i.e., when using a single memory block for the entire data of a multi-dimensional array, in-place updates become crucial for highly competitive performance. This paper proposes a novel code generation technique for performing fold-operations on hyper-planes of multi-dimensional arrays, where the fold-operation itself operates on non-scalar sub-arrays, i.e., on vectors or higher-dimensional arrays. This technique allows for a single result array allocation over the entire folding operation without requiring the folding operation itself to be scalarised. It enables the utilisation of vector operations without any added memory allocation or copying overhead. We describe our technique in the context of SaC, sketch our implementation in the context of the compiler sac2c and provide some initial performance measurements that give an indication of the effectiveness of this new technique.},
     booktitle = {33rd Symposium on Implementation and Application of Functional Languages},
     pages = {29–41},
     numpages = {13},
     keywords = {reference counting, compiler optimisation, memory management, SaC},
     location = {Nijmegen, Netherlands},
     series = {IFL '21} 
    }
    Memory management plays a key role when trying to compile functional programs into efficiently executable code. In particular when using flat representations for multi-dimensional arrays, i.e., when using a single memory block for the entire data of a multi-dimensional array, in-place updates become crucial for highly competitive performance. This paper proposes a novel code generation technique for performing fold-operations on hyper-planes of multi-dimensional arrays, where the fold-operation itself operates on non-scalar sub-arrays, i.e., on vectors or higher-dimensional arrays. This technique allows for a single result array allocation over the entire folding operation without requiring the folding operation itself to be scalarised. It enables the utilisation of vector operations without any added memory allocation or copying overhead. We describe our technique in the context of SaC, sketch our implementation in the context of the compiler sac2c and provide some initial performance measurements that give an indication of the effectiveness of this new technique.
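    The execution pattern the optimisation targets can be sketched in NumPy (illustrative; fold_planes is a made-up name): one result buffer is allocated up front and every fold step is a vectorised in-place update:

      import numpy as np

      def fold_planes(x):
          # fold (here: elementwise sum) over the outermost axis of x,
          # whose elements are non-scalar hyper-planes; a single result
          # buffer is allocated once for the entire fold
          acc = np.array(x[0])                 # copy, so x stays untouched
          for plane in x[1:]:
              np.add(acc, plane, out=acc)      # no per-step allocation
          return acc

      x = np.arange(24.).reshape(4, 2, 3)
      assert np.allclose(fold_planes(x), x.sum(axis=0))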
  • On Mapping N-Dimensional Data-Parallelism Efficiently into GPU-Thread-Spaces Niek Janssen, Sven-Bodo Scholz (2022). In 33rd Symposium on Implementation and Application of Functional Languages. pp. 54–66. Association for Computing Machinery. New York, NY, USA. DOI BibTeX Abstract
    @inproceedings{JansSchoIFL21,
     author = {Janssen, Niek and Scholz, Sven-Bodo},
     title = {On Mapping N-Dimensional Data-Parallelism Efficiently into GPU-Thread-Spaces},
     year = {2022}, isbn = {9781450386449},
     publisher = {Association for Computing Machinery},
     address = {New York, NY, USA},
     url = {https://doi.org/10.1145/3544885.3544894},
     doi = {10.1145/3544885.3544894},
     category = {par},
     summary = {Completely revamped kernel executions based on index space transformations.},
     abstract = {Data-Parallelism on multi-dimensional arrays can be conveniently specified as element-wise computations that are executed across n-dimensional index-spaces. While this at first glance matches very well the concept of n-dimensional thread-spaces as provided by the programming model of GPUs, in practice, a one-to-one correspondence often does not work out. This paper proposes a small set of combinators for mapping multi-dimensional index spaces onto multi-dimensional thread-index spaces suitable for execution on GPUs. For each combinator, we provide an inverse operation, which allows the original indices to be recovered within the individual threads. This setup facilitates arbitrary n-dimensional array computations to be executed on GPUs with arbitrary thread-space constraints provided the overall resources required for the computation do not exceed those of the GPU.},
     booktitle = {33rd Symposium on Implementation and Application of Functional Languages},
     pages = {54–66},
     numpages = {13},
     keywords = {SaC, array transformations, functional languages, GPU, CUDA},
     location = {Nijmegen, Netherlands},
     series = {IFL '21} 
    }
    Data-Parallelism on multi-dimensional arrays can be conveniently specified as element-wise computations that are executed across n-dimensional index-spaces. While this at first glance matches very well the concept of n-dimensional thread-spaces as provided by the programming model of GPUs, in practice, a one-to-one correspondence often does not work out. This paper proposes a small set of combinators for mapping multi-dimensional index spaces onto multi-dimensional thread-index spaces suitable for execution on GPUs. For each combinator, we provide an inverse operation, which allows the original indices to be recovered within the individual threads. This setup facilitates arbitrary n-dimensional array computations to be executed on GPUs with arbitrary thread-space constraints provided the overall resources required for the computation do not exceed those of the GPU.
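    A toy version of such a combinator and its inverse (illustrative Python, not the paper's combinator set): flatten an n-dimensional index space into linear "thread ids" and recover the original index inside each thread:

      import numpy as np

      def flatten_idx(idx, shape):
          # n-d index -> linear thread id (row-major)
          return int(np.ravel_multi_index(idx, shape))

      def unflatten_idx(tid, shape):
          # inverse combinator: thread id -> original n-d index
          return tuple(int(c) for c in np.unravel_index(tid, shape))

      shape = (3, 5, 7)   # a 3-d iteration space mapped onto 105 threads
      assert all(unflatten_idx(flatten_idx(i, shape), shape) == i
                 for i in np.ndindex(*shape))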

2021

  • Array Languages Make Neural Networks Fast Artjoms Šinkarovs, Hans Viessmann, Sven-Bodo Scholz (2021). In Proceedings of the 6th ACM SIGPLAN International Workshop on Libraries, Languages and Compilers for Array Programming. ACM. New York, NY, USA. DOI cnn-in-sac-ARRAY21.pdf BibTeX Abstract
    @inproceedings{SinkViesSchoARRAY21,
     author = {Artjoms Šinkarovs and Hans Viessmann and Sven-Bodo Scholz},
     title = {Array Languages Make Neural Networks Fast},
     year = {2021},
     isbn = {9781450384667},
     publisher = {ACM},
     address = {New York, NY, USA},
     url = {cnn-in-sac-ARRAY21.pdf},
     doi = {10.1145/3315454.3464312},
     summary = {Demonstrates how SaC combines high productivity with high performance: less than 100 lines for a CNN from scratch with competitive performance.},
     abstract = {Most implementations of machine learning algorithms are based on special-purpose frameworks such as TensorFlow or PyTorch. While these frameworks are convenient to use, they introduce multi-million lines of code dependency that one has to trust, understand and potentially modify. As an alternative, this paper investigates a direct implementation of a state of the art Convolutional Neural Network (CNN) in an array language. While our implementation requires 150 lines of code to define the special-purpose operators needed for CNNs, which are readily provided through frameworks such as TensorFlow and PyTorch, our implementation out-performs these frameworks by factors 2 and 3 on a fixed set of hardware — a 64-core GPU-accelerated machine; for a simple example network. The resulting specification is written in a rank-polymorphic data-parallel style, and it can be immediately leveraged by optimising compilers. Indeed, array languages make neural networks fast.},
     booktitle = {Proceedings of the 6th ACM SIGPLAN International Workshop on Libraries, Languages and Compilers for Array Programming},
     numpages = {12},
     keywords = {machine learning,array languages},
     category = {apps},
     location = {Virtual,Canada},
     series = {ARRAY 2021}
    }
    Most implementations of machine learning algorithms are based on special-purpose frameworks such as TensorFlow or PyTorch. While these frameworks are convenient to use, they introduce multi-million lines of code dependency that one has to trust, understand and potentially modify. As an alternative, this paper investigates a direct implementation of a state of the art Convolutional Neural Network (CNN) in an array language. While our implementation requires 150 lines of code to define the special-purpose operators needed for CNNs, which are readily provided through frameworks such as TensorFlow and PyTorch, our implementation out-performs these frameworks by factors 2 and 3 on a fixed set of hardware — a 64-core GPU-accelerated machine; for a simple example network. The resulting specification is written in a rank-polymorphic data-parallel style, and it can be immediately leveraged by optimising compilers. Indeed, array languages make neural networks fast.
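    For flavour, one CNN building block, a 'valid' 2-d convolution-style window contraction, written in the same loop-free, data-parallel style, sketched here in NumPy rather than SaC:

      import numpy as np
      from numpy.lib.stride_tricks import sliding_window_view

      def conv2d_valid(img, kernel):
          # materialise all kernel-sized windows as a rank-4 view, then
          # contract every window with the kernel in one data-parallel step
          windows = sliding_window_view(img, kernel.shape)  # (H-k+1, W-k+1, k, k)
          return np.einsum('xyij,ij->xy', windows, kernel)

      img, k = np.random.rand(28, 28), np.random.rand(3, 3)
      out = conv2d_valid(img, k)        # shape (26, 26)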
  • Effective Host-GPU Memory Management Through Code Generation Hans-Nikolai Vießmann, Sven-Bodo Scholz (2021). In Proceedings of the 32nd Symposium on Implementation and Application of Functional Languages. pp. 138–149. Association for Computing Machinery. New York, NY, USA. DOI BibTeX Abstract
    @inproceedings{ViesSchoIFL20, 
    author = {Vie\ss{}mann, Hans-Nikolai and Scholz, Sven-Bodo}, 
    title = {Effective Host-GPU Memory Management Through Code Generation}, 
    year = {2021}, isbn = {9781450389631}, 
    publisher = {Association for Computing Machinery}, 
    address = {New York, NY, USA}, 
    url = {https://doi.org/10.1145/3462172.3462199}, 
    doi = {10.1145/3462172.3462199}, 
    abstract = {NVIDIA’s CUDA provides several options to orchestrate the management of host and device memory as well as the communication between them. In this paper we look at these choices, identify the program changes required when switching between them, and we observe their effect on application performance. We present code generation schemes that translate resource-agnostic program specifications, i.e., programs without any explicit notion of memory or GPU kernels, into five CUDA versions that differ in the use of the memory and communication API of CUDA only. An implementation of these code generators within the compiler of the functional programming language Single-Assignment C (SaC) shows performance differences between the variants by up to a factor of 3. Performance analyses reveal that the preferred choices depend on a combination of several factors, including the actual hardware being used, and several aspects of the application itself. A clear choice, therefore, cannot be made a priori. Instead, it seems essential that different variants can be generated from a single source for achieving performance portability across GPU devices.}, 
    booktitle = {Proceedings of the 32nd Symposium on Implementation and Application of Functional Languages}, 
    pages = {138–149}, 
    numpages = {12}, 
    keywords = {GPU, communication models, SaC, memory management, code generation, CUDA, transfer bandwidth}, 
    location = {Canterbury, United Kingdom}, 
    series = {IFL 2020} 
    }
    NVIDIA’s CUDA provides several options to orchestrate the management of host and device memory as well as the communication between them. In this paper we look at these choices, identify the program changes required when switching between them, and we observe their effect on application performance. We present code generation schemes that translate resource-agnostic program specifications, i.e., programs without any explicit notion of memory or GPU kernels, into five CUDA versions that differ in the use of the memory and communication API of CUDA only. An implementation of these code generators within the compiler of the functional programming language Single-Assignment C (SaC) shows performance differences between the variants by up to a factor of 3. Performance analyses reveal that the preferred choices depend on a combination of several factors, including the actual hardware being used, and several aspects of the application itself. A clear choice, therefore, cannot be made a priori. Instead, it seems essential that different variants can be generated from a single source for achieving performance portability across GPU devices.

2019

  • Tensor Comprehensions in SaC Sven-Bodo Scholz, Artjoms Šinkarovs (2019). In Proceedings of the 31st Symposium on the Implementation and Application of Functional Programming Languages. ACM. New York, NY, USA. DOI tensor-comprehensions-IFL19.pdf BibTeX Abstract
    @inproceedings{SchoShinIFL19,
     author = {Sven-Bodo Scholz and Artjoms Šinkarovs},
     title = {Tensor Comprehensions in {SaC}},
     year = {2019},
     isbn = {9781450375627},
     publisher = {ACM},
     address = {New York, NY, USA},
     booktitle = {Proceedings of the 31st Symposium on the Implementation and Application of Functional Programming Languages},
     series = {IFL '19},
     articleno = {15},
     numpages = {13},
     doi = {10.1145/3412932.3412947},
     url = {tensor-comprehensions-IFL19.pdf},
     summary = {Motivation, syntax, and semantics of Tensor Comprehensions in SaC.},
     abstract = {We propose a new notation for data parallel operators on multi-dimensional arrays named tensor comprehensions. This notation combines the basic principle of array-comprehensions with syntactical shortcuts very close to those found in the so-called Tensor Notations used in Physics and Mathematics. As a result, complex operators with rich semantics can be defined concisely. The key to this conciseness lies in the ability to define shape-polymorphic operations combined with the ability to infer array shapes from the immediate context. The paper provides a definition of the proposed notation, a formal shape inference process, as well as a set of re-write rules that translates tensor comprehensions as a zero-cost syntactic sugar into standard SaC expressions.},
     keywords = {Array comprehensions,
     Call by value, Array programming, Functional languages},
     category  = {core,design},
     location = {Singapore}
    }
    We propose a new notation for data parallel operators on multi-dimensional arrays named tensor comprehensions. This notation combines the basic principle of array-comprehensions with syntactical shortcuts very close to those found in the so-called Tensor Notations used in Physics and Mathematics. As a result, complex operators with rich semantics can be defined concisely. The key to this conciseness lies in the ability to define shape-polymorphic operations combined with the ability to infer array shapes from the immediate context. The paper provides a definition of the proposed notation, a formal shape inference process, as well as a set of re-write rules that translates tensor comprehensions as a zero-cost syntactic sugar into standard SaC expressions.
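    To give a feel for the notation (the comprehension syntax shown is paraphrased and approximate), a comprehension such as { [i,j] -> sum(A[i] * B[:,j]) } leaves the ranges of i and j to shape inference; a Python rendering has to spell those bounds out:

      import numpy as np

      A = np.arange(6.).reshape(2, 3)
      B = np.arange(12.).reshape(3, 4)

      # the bounds that SaC infers from the shapes of A and B are explicit here
      C = np.array([[np.sum(A[i] * B[:, j]) for j in range(B.shape[1])]
                    for i in range(A.shape[0])])
      assert np.allclose(C, A @ B)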

2018

  • Extended Memory Reuse: An Optimisation for Reducing Memory Allocations Hans-Nikolai Vießmann, Artjoms Šinkarovs, Sven-Bodo Scholz (2018). In Proceedings of the 30th Symposium on Implementation and Application of Functional Languages. pp. 107–118. Association for Computing Machinery. New York, NY, USA. DOI extended-memory-reuse-IFL18.pdf BibTeX Abstract
    @inproceedings{ViesSinkSchoIFL18,
    author = {Vie\ss{}mann, Hans-Nikolai and \v{S}inkarovs, Artjoms and Scholz, Sven-Bodo},
    title = {Extended Memory Reuse: An Optimisation for Reducing Memory Allocations},
    year = {2018},
    isbn = {9781450371438},
    publisher = {Association for Computing Machinery},
    address = {New York, NY, USA},
    doi = {10.1145/3310232.3310242},
    summary = {Explains all details of the optimisation EMR.},
    url = {extended-memory-reuse-IFL18.pdf},
    abstract = {In this paper we present an optimisation for reference counting based garbage collection. The optimisation aims at reducing the total number of calls to the heap manager while preserving the key benefits of reference counting, i.e. the opportunities for in-place updates as well as memory deallocation without global garbage collection. The key idea is to carefully extend the lifetime of variables so that memory deallocations followed by memory allocations of the same size can be replaced by a direct memory reuse. Such memory reuse turns out particularly useful in the context of innermost loops of compute-intensive applications. It leads to a runtime behaviour that performs pointer swaps between buffers in the same way it would be implemented manually in languages that require explicit memory management, e.g. C. We have implemented the proposed optimisation in the context of the Single-Assignment C compiler tool chain. The paper provides an algorithmic description of our optimisation and an evaluation of its effectiveness over a collection of benchmarks including a subset of the Rodinia benchmarks and the NAS Parallel Benchmarks. We show that for several benchmarks with allocations within loops our optimisation reduces the amount of allocations by a few orders of magnitude. We also observe no negative impact on the overall memory footprint nor on the overall runtime. Instead, for some sequential executions we find mild improvement, and on GPU devices we observe speedups of up to a factor of 4x.},
    booktitle = {Proceedings of the 30th Symposium on Implementation and Application of Functional Languages},
    pages = {107–118},
    numpages = {12},
    category = {core,opts},
    keywords = {reference counting, memory management, compiler optimisation},
    location = {Lowell, MA, USA},
    series = {IFL 2018}
    }
    In this paper we present an optimisation for reference counting based garbage collection. The optimisation aims at reducing the total number of calls to the heap manager while preserving the key benefits of reference counting, i.e. the opportunities for in-place updates as well as memory deallocation without global garbage collection. The key idea is to carefully extend the lifetime of variables so that memory deallocations followed by memory allocations of the same size can be replaced by a direct memory reuse. Such memory reuse turns out particularly useful in the context of innermost loops of compute-intensive applications. It leads to a runtime behaviour that performs pointer swaps between buffers in the same way it would be implemented manually in languages that require explicit memory management, e.g. C. We have implemented the proposed optimisation in the context of the Single-Assignment C compiler tool chain. The paper provides an algorithmic description of our optimisation and an evaluation of its effectiveness over a collection of benchmarks including a subset of the Rodinia benchmarks and the NAS Parallel Benchmarks. We show that for several benchmarks with allocations within loops our optimisation reduces the amount of allocations by a few orders of magnitude. We also observe no negative impact on the overall memory footprint nor on the overall runtime. Instead, for some sequential executions we find mild improvement, and on GPU devices we observe speedups of up to a factor of 4x.
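    The runtime behaviour the optimisation recovers is the classic two-buffer pointer swap, rendered here by hand in NumPy (illustrative only; EMR derives this automatically from allocation-free functional code):

      import numpy as np

      def jacobi_1d(x, steps):
          # allocate two buffers once; each iteration writes into the spare
          # buffer and the two are swapped, so no allocation occurs inside
          # the loop
          cur = x.astype(float)
          nxt = np.empty_like(cur)
          for _ in range(steps):
              nxt[1:-1] = 0.5 * (cur[:-2] + cur[2:])
              nxt[0], nxt[-1] = cur[0], cur[-1]    # keep boundary values
              cur, nxt = nxt, cur                  # pointer swap, not a copy
          return cur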
  • A Rosetta Stone for array languages Artjoms Sinkarovs, Robert Bernecky, Hans-Nikolai Vießmann, Sven-Bodo Scholz (2018). In Proceedings of the 5th ACM SIGPLAN International Workshop on Libraries, Languages, and Compilers for Array Programming, ARRAY (at PLDI) 2018, Philadelphia, PA, USA, June 19, 2018. pp. 1–10. DOI BibTeX Abstract
    @inproceedings{SinkBernViesSchoARRAY18,
      author    = {Artjoms Sinkarovs and
                   Robert Bernecky and
                   Hans{-}Nikolai Vie{\ss}mann and
                   Sven{-}Bodo Scholz},
      title     = {A Rosetta Stone for array languages},
      booktitle = {Proceedings of the 5th {ACM} {SIGPLAN} International Workshop on Libraries,
                   Languages, and Compilers for Array Programming, ARRAY (at PLDI) 2018, Philadelphia,
                   PA, USA, June 19, 2018},
        abstract  = {This paper aims to foster cross-fertilisation between programming language and compiler research performed on different array programming language infrastructures. We study how to enable better comparability of concepts and techniques by looking into generic translations between array languages. Our approach is based on the idea of a basic core language Heh which only captures the absolute essentials of array languages: multi-dimensional arrays and shape-invariant operations on them. Subsequently, we investigate how to map these constructs into several existing languages: SaC, APL, Julia, Python, and C. This approach provides us with some first understanding on how the peculiarities of these languages affect their suitability for expressing the basic building-blocks of array languages. We show that the existing tool-chains by-and-large are very sensitive to the way code is specified. Furthermore, we expose several fundamental programming patterns where optimisations missing in one or the other tool chain inhibit fair comparisons and, with it, cross-fertilisation.},
      pages     = {1--10},
      year      = {2018},
      doi       = {10.1145/3219753.3219754},
      timestamp = {Tue, 10 Jul 2018 08:34:20 +0200},
    }
    This paper aims to foster cross-fertilisation between programming language and compiler research performed on different array programming language infrastructures. We study how to enable better comparability of concepts and techniques by looking into generic translations between array languages. Our approach is based on the idea of a basic core language Heh which only captures the absolute essentials of array languages: multi-dimensional arrays and shape-invariant operations on them. Subsequently, we investigate how to map these constructs into several existing languages: SaC, APL, Julia, Python, and C. This approach provides us with some first understanding on how the peculiarities of these languages affect their suitability for expressing the basic building-blocks of array languages. We show that the existing tool-chains by-and-large are very sensitive to the way code is specified. Furthermore, we expose several fundamental programming patterns where optimisations missing in one or the other tool chain inhibit fair comparisons and, with it, cross-fertilisation.

2017

  • Recursive Array Comprehensions in a Call-by-Value Language Artjoms Sinkarovs, Sven-Bodo Scholz, Robert Stewart, Hans-Nikolai Vießmann (2017). In Proceedings of the 29th Symposium on Implementation and Application of Functional Programming Languages, IFL 2017, Bristol, UK, August 30 - September 01, 2017. pp. 5:1–5:12. DOI BibTeX Abstract
    @inproceedings{SinkSchoStewViesIFL17,
      author    = {Artjoms Sinkarovs and
                   Sven{-}Bodo Scholz and
                   Robert Stewart and
                   Hans{-}Nikolai Vie{\ss}mann},
      title     = {Recursive Array Comprehensions in a Call-by-Value Language},
      booktitle = {Proceedings of the 29th Symposium on Implementation and Application
                   of Functional Programming Languages, {IFL} 2017, Bristol, UK, August
                   30 - September 01, 2017},
      abstract  = {Recursive value definitions in the context of functional programming languages that are based on a call-by-value semantics are known to be challenging. A lot of prior work exists in the context of languages such as Scheme and OCaml. In this paper, we look at the problem of recursive array definitions within a call-by-value setting. We propose a solution that enables recursive array definitions as long as there are no cyclic dependences between array elements. The paper provides a formal semantics definition, sketches possible compiler implementations and relates to a prototypical implementation of an interpreter in OCaml. Furthermore, we briefly discuss how this approach could be extended to other data structures and how it could serve as a basis to further extend mutually recursive value definitions in a call-by-value setting in general.},
      pages     = {5:1--5:12},
      year      = {2017},
      doi       = {10.1145/3205368.3205373},
      timestamp = {Tue, 21 Aug 2018 17:24:50 +0200},
    }
    Recursive value definitions in the context of functional programming languages that are based on a call-by-value semantics are known to be challenging. A lot of prior work exists in the context of languages such as Scheme and OCaml. In this paper, we look at the problem of recursive array definitions within a call-by-value setting. We propose a solution that enables recursive array definitions as long as there are no cyclic dependences between array elements. The paper provides a formal semantics definition, sketches possible compiler implementations and relates to a prototypical implementation of an interpreter in OCaml. Furthermore, we briefly discuss how this approach could be extended to other data structures and how it could serve as a basis to further extend mutually recursive value definitions in a call-by-value setting in general.
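    A minimal instance of such a definition, sketched in Python: element i of the array refers to elements i-1 and i-2 of the very array being defined, which is safe to evaluate in index order because the dependences are acyclic:

      import numpy as np

      def fib_array(n):
          # recursive array definition: a[i] depends on a[i-1] and a[i-2];
          # acyclic dependences permit evaluation in index order
          a = np.empty(n, dtype=np.int64)
          for i in range(n):
              a[i] = i if i < 2 else a[i - 1] + a[i - 2]
          return a

      assert list(fib_array(8)) == [0, 1, 1, 2, 3, 5, 8, 13]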

2015

  • Type-driven Data Layouts for Improved Vectorisation Artjoms Šinkarovs, Sven-Bodo Scholz (May 2015). Concurrency and Computation: Practice and Experience 28 (7) pp. 2092–2119. Wiley-Blackwell. DOI BibTeX Abstract
    @Article{SinkSchoCCPE15,
      author    = {Šinkarovs, Artjoms and Scholz, Sven-Bodo},
      title     = {Type-driven Data Layouts for Improved Vectorisation},
      journal   = {Concurrency and Computation: Practice and Experience},
      year      = {2015},
      volume    = {28},
      number    = {7},
      pages     = {2092--2119},
      month     = {May},
      abstract  = {Vector instructions of modern CPUs are crucially important for the performance of compute-intensive algorithms. Auto-vectorisation often fails because of an unfortunate choice of data layout by the programmer. This paper proposes a data layout inference for auto-vectorisation that identifies layout transformations that convert single instruction, multiple data-unfavourable layouts of data structures into favourable ones. We present a type system for layout transformations, and we sketch an inference algorithm for it. Finally, we present some initial performance figures for the impact of the inferred layout transformations. They show that non-intuitive layouts that are inferred through our system can have a vast performance impact on compute intensive programs.},
      category  = {types,opt,par},
      doi       = {10.1002/cpe.3501},
      issn      = {1532-0626},
      keywords  = {Data parallelism, Hpc, Vectorisation},
      publisher = {Wiley-Blackwell},
    }
    Vector instructions of modern CPUs are crucially important for the performance of compute-intensive algorithms. Auto-vectorisation often fails because of an unfortunate choice of data layout by the programmer. This paper proposes a data layout inference for auto-vectorisation that identifies layout transformations that convert single instruction, multiple data-unfavourable layouts of data structures into favourable ones. We present a type system for layout transformations, and we sketch an inference algorithm for it. Finally, we present some initial performance figures for the impact of the inferred layout transformations. They show that non-intuitive layouts that are inferred through our system can have a vast performance impact on compute intensive programs.
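    The kind of transformation being inferred is, in its simplest form, an array-of-structs to struct-of-arrays conversion, done here by hand in NumPy (the paper infers such layout changes from types rather than requiring manual rewrites):

      import numpy as np

      # array-of-structs: x and y interleaved in memory, SIMD-unfriendly
      aos = np.zeros(1024, dtype=[('x', np.float32), ('y', np.float32)])

      # struct-of-arrays: each field contiguous, so elementwise kernels
      # map directly onto vector lanes
      xs = np.ascontiguousarray(aos['x'])
      ys = np.ascontiguousarray(aos['y'])
      norms = np.sqrt(xs * xs + ys * ys)   # vectorises cleanly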
  • Abstract Expressionism for Parallel Performance Robert Bernecky, Sven-Bodo Scholz (2015). In Proceedings of the 2nd ACM SIGPLAN International Workshop on Libraries, Languages, and Compilers for Array Programming. pp. 54–59. DOI BibTeX Abstract
    @InProceedings{BernSchoArray15,
      author       = {Robert Bernecky and Sven-Bodo Scholz},
      title        = {Abstract Expressionism for Parallel Performance},
      booktitle    = {Proceedings of the 2nd ACM SIGPLAN International Workshop on Libraries, Languages, and Compilers for Array Programming},
      year         = {2015},
      pages        = {54--59},
      organization = {ACM},
  abstract     = {Programming with abstract, mathematical expressions offers benefits including terser programs, easier communication of algorithms, ability to prove theorems about algorithms, increased parallelism, and improved programming productivity. Common belief is that higher levels of abstraction imply a larger semantic gap between the user and computer and, therefore, typically slower execution, whether sequential or parallel. In recent years, domain-specific languages have been shown to close this gap through sophisticated optimizations benefiting from domain-specific knowledge. In this paper, we demonstrate that the semantic gap can also be closed for non-domain-specific functional array languages, without requiring embedding of language-specific semantic knowledge into the compiler tool chain. We present a simple example of APL-style programs, compiled into C-code that outperform equivalent C programs in both sequential and parallel (OpenMP) environments. We offer insights into abstract expressionist programming, by comparing the characteristics and performance of a numerical relaxation benchmark written in C99, C99 with OpenMP directives, scheduling code, and pragmas, and in SaC, a functional array language. We compare three algorithmic styles: if/then/else, hand-optimized loop splitting, and an abstract, functional style whose roots lie in APL. We show that the algorithms match or outperform serial C, and that the hand-optimized and abstract styles generate identical code, and so have identical performance. Furthermore, parallel variants also outperform the best OpenMP C variant by up to a third, with no source code modifications. Preserving an algorithm's abstract expression during optimization opens the door to generation of radically different code for different architectures.},
      doi          = {10.1145/2774959.2774962},
    }
    Programming with abstract, mathematical expressions offers benefits including terser programs, easier communication of algorithms, ability to prove theorems about algorithms, increased parallelism, and improved programming productivity. Common belief is that higher levels of abstraction imply a larger semantic gap between the user and computer and, therefore, typically slower execution, whether sequential or parallel. In recent years, domain-specific languages have been shown to close this gap through sophisticated optimizations benefiting from domain-specific knowledge. In this paper, we demonstrate that the semantic gap can also be closed for non-domain-specific functional array languages, without requiring embedding of language-specific semantic knowledge into the compiler tool chain. We present a simple example of APL-style programs, compiled into C-code that outperform equivalent C programs in both sequential and parallel (OpenMP) environments. We offer insights into abstract expressionist programming, by comparing the characteristics and performance of a numerical relaxation benchmark written in C99, C99 with OpenMP directives, scheduling code, and pragmas, and in SaC, a functional array language. We compare three algorithmic styles: if/then/else, hand-optimized loop splitting, and an abstract, functional style whose roots lie in APL. We show that the algorithms match or outperform serial C, and that the hand-optimized and abstract styles generate identical code, and so have identical performance. Furthermore, parallel variants also outperform the best OpenMP C variant by up to a third, with no source code modifications. Preserving an algorithm's abstract expression during optimization opens the door to generation of radically different code for different architectures.
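    The abstract style in question expresses a relaxation step as whole-array shifts instead of edge-case branches or hand-split loops; a NumPy transcription of that formulation (the paper's variants are written in C99 and SaC):

      import numpy as np

      def relax(grid):
          # new interior value = average of the four neighbours, written as
          # four shifted whole-array reads; no if/then/else edge tests and
          # no hand-split loops
          g = grid.copy()
          g[1:-1, 1:-1] = 0.25 * (grid[:-2, 1:-1] + grid[2:, 1:-1] +
                                  grid[1:-1, :-2] + grid[1:-1, 2:])
          return g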
  • Making Fortran Legacy Code More Functional: Using the BGS Geomagnetic Field Modelling System As an Example Hans-Nikolai Viessmann, Sven-Bodo Scholz, Artjoms Sinkarovs, Brian Bainbridge, Brian Hamilton, Simon Flower (2015). In Proceedings of the 27th Symposium on the Implementation and Application of Functional Programming Languages. pp. 11:1–11:13. ACM. New York, NY, USA. DOI BibTeX Abstract
    @InProceedings{VSSIFL2015,
     author = {Viessmann, Hans-Nikolai and Scholz, Sven-Bodo and Sinkarovs, Artjoms and Bainbridge, Brian and Hamilton, Brian and Flower, Simon},
     title = {Making Fortran Legacy Code More Functional: Using the BGS Geomagnetic Field Modelling System As an Example},
     booktitle = {Proceedings of the 27th Symposium on the Implementation and Application of Functional Programming Languages},
     series = {IFL '15},
     year = {2015},
     isbn = {978-1-4503-4273-5},
     location = {Koblenz, Germany},
     pages = {11:1--11:13},
     articleno = {11},
     numpages = {13},
     doi = {10.1145/2897336.2897348},
     acmid = {2897348},
     publisher = {ACM},
     address = {New York, NY, USA},
     keywords = {eigensystem, foreign function interface, fortran, functional programming, high-performance computing, single-assignment C},
     summary = {This paper presents the support for calling SaC from Fortran and it reports about the experiences in the context of the BGS Geomagnetic Field Modelling System code.},
 abstract  = {This paper presents an application case study of the British Geological Survey's (BGS) Geomagnetic Field Modelling System code. The program consists of roughly 20 000 lines of highly-tuned Fortran MPI code that has a runtime of about 12 hours for a single execution cycle on a cluster utilising approximately 100 CPU cores. The program contains a sequential bottleneck that executes on a single node of the cluster and takes up to 50% of the overall runtime. We describe an experiment in which we rewrote the bottleneck Fortran code in SaC, to make use of auto-parallelisation provided by the SaC compiler. The paper also presents an implementation of a foreign-function interface, to link the SaC kernel with the Fortran application. Our initial performance measurements compare the SaC kernel performance with the Fortran bottleneck code; we also present results using an OpenMP Fortran implementation. Our figures show that the SaC-based implementation achieves roughly a 12.5% runtime improvement, and outperforms the OpenMP implementation.},
     category = {core,ffi,apps,par},
    }
    This paper presents an application case study of the British Geological Survey's (BGS) Geomagnetic Field Modelling System code. The program consists of roughly 20 000 lines of highly-tuned Fortran MPI code that has a runtime of about 12 hours for a single execution cycle on a cluster utilising approximately 100 CPU cores. The program contains a sequential bottleneck that executes on a single node of the cluster and takes up to 50% of the overall runtime. We describe an experiment in which we rewrote the bottleneck Fortran code in SaC, to make use of auto-parallelisation provided by the SaC compiler. The paper also presents an implementation of a foreign-function interface, to link the SaC kernel with the Fortran application. Our initial performance measurements compare the SaC kernel performance with the Fortran bottleneck code; we also present results using an OpenMP Fortran implementation. Our figures show that the SaC-based implementation achieves roughly a 12.5% runtime improvement, and outperforms the OpenMP implementation.
  • Dynamic Adaptation of Functional Runtime Systems Through External Control Stuart Gordon, Sven-Bodo Scholz (2015). In Proceedings of the 27th Symposium on the Implementation and Application of Functional Programming Languages. pp. 10:1–10:13. ACM. New York, NY, USA. DOI BibTeX Abstract
    @InProceedings{GordonScholzIFL15,
     author = {Gordon, Stuart and Scholz, Sven-Bodo},
     title = {Dynamic Adaptation of Functional Runtime Systems Through External Control},
     booktitle = {Proceedings of the 27th Symposium on the Implementation and Application of Functional Programming Languages},
     series = {IFL '15},
     year = {2015},
     isbn = {978-1-4503-4273-5},
     location = {Koblenz, Germany},
     pages = {10:1--10:13},
     articleno = {10},
     numpages = {13},
     doi = {10.1145/2897336.2897347},
     acmid = {2897347},
     publisher = {ACM},
     address = {New York, NY, USA},
     keywords = {SAC, array programming, compilers, dynamic adaptation, functional programming, high performance computing, high-level parallel programming, over-subscription},
     category = {core, opt},
     abstract = {In this paper, we present a novel approach towards providing compiler generated runtime means for dynamic adaptation. The key novelty of the proposed solution is a complete separation between the runtime adaptation mechanism itself and the control mechanism of the adaptation. This strict separation has various benefits including experimentation with adaptation control mechanisms without modifying the runtime system itself, and opening up the possibility for control mechanisms that control several applications possibly even across different runtime systems. The paper does not only present the basic principles of the approach taken, it also describes our prototypical implementation in the context of the functional array programming language SaC. The dynamic adaptation we are concerned with in the context of this paper is an adaptation in the number of computing cores utilised during parallel execution on multiple cores. We show how this adaptation mechanism can be leveraged to deal with changes in the background load generated from other applications running concurrently with the application whose parallelism we want to control.},
     summary = {This paper describes the first cut on providing a runtime interface for continuous external adaptation of SaC programs. It also demonstrates how this can be used to adapt the number of threads used within a with-loop solely based on previous execution times of the same with-loop.},
    }
    In this paper, we present a novel approach towards providing compiler generated runtime means for dynamic adaptation. The key novelty of the proposed solution is a complete separation between the runtime adaptation mechanism itself and the control mechanism of the adaptation. This strict separation has various benefits including experimentation with adaptation control mechanisms without modifying the runtime system itself, and opening up the possibility for control mechanisms that control several applications possibly even across different runtime systems. The paper does not only present the basic principles of the approach taken, it also describes our prototypical implementation in the context of the functional array programming language SaC. The dynamic adaptation we are concerned with in the context of this paper is an adaptation in the number of computing cores utilised during parallel execution on multiple cores. We show how this adaptation mechanism can be leveraged to deal with changes in the background load generated from other applications running concurrently with the application whose parallelism we want to control.

2014

  • Polyhedral Methods for Improving Parallel Update-in-Place Jing Guo, Robert Bernecky, Jeyarajan Thiyagalingam, Sven-Bodo Scholz (Jan 2014). In Proceedings of the 4th International Workshop on Polyhedral Compilation Techniques. Vienna, Austria. impact2014-guo.pdf BibTeX Abstract
    @inproceedings{GuoBernThiySchoIMPACT14,
      author = {Jing Guo and Robert Bernecky and Jeyarajan Thiyagalingam and Sven-Bodo Scholz},
      title = {Polyhedral Methods for Improving Parallel Update-in-Place},
  abstract = {We demonstrate an optimization, denoted as polyhedral reuse analysis (PRA), that uses polyhedral methods to improve the analysis of in-place update for single-assignment arrays. The PRA optimization attempts to determine when parallel array operations that jointly define new arrays from existing ones can reuse the memory of the existing arrays, rather than creating new ones. Polyhedral representations and related dependency inference methods facilitate that analysis. In the context of SaC, we demonstrate the impact of this optimisation using two non-trivial benchmarks evaluated on conventional shared memory machines and on GPUs, obtaining performance improvements of 2–8 times for LU Decomposition and of 2–10 times for Needleman-Wunsch, over the same computations with PRA disabled.},
      summary = {Describes polyhedral reuse analysis.},
      booktitle = {Proceedings of the 4th International Workshop on Polyhedral Compilation Techniques},
      editor = {Rajopadhye, Sanjay and Verdoolaege, Sven},
      year   = 2014,
      month  = Jan,
      address = {Vienna, Austria},
      url = {impact2014-guo.pdf},
      category = {opt}
    }
    We demonstrate an optimization, denoted as polyhedral reuse analysis (PRA), that uses polyhedral methods to improve the analysis of in-place update for single-assignment arrays. The PRA optimization attempts to determine when parallel array operations that jointly define new arrays from existing ones can reuse the memory of the existing arrays, rather than creating new ones. Polyhedral representations and related dependency inference methods facilitate that analysis. In the context of SaC, we demonstrate the impact of this optimisation using two non-trivial benchmarks evaluated on conventional shared memory machines and on GPUs, obtaining performance improvements of 2–8 times for LU Decomposition and of 2–10 times for Needleman-Wunsch, over the same computations with PRA disabled.
  • SaC/C Formulations of the All-pair N-body Problem and Their Performance on SMPs and GPGPUs A. Šinkarovs, S.B. Scholz, R. Bernecky, R. Douma, C. Grelck (2014). Concurrency and Computation: Practice and Experience 26 (4) pp. 952–971. DOI 2014_0.pdf BibTeX Abstract
    @Article{SinkSchoBernetalCCPE13,
      author   = {A. Šinkarovs and S.B. Scholz and R. Bernecky and R. Douma and C. Grelck},
      title    = {SaC/C Formulations of the All-pair N-body Problem and Their Performance on SMPs and GPGPUs},
      journal  = {Concurrency and Computation: Practice and Experience},
      year     = {2014},
      volume   = {26},
      number   = {4},
      pages    = {952--971},
  abstract = {This paper describes our experience in implementing the classical N-body algorithm in SaC and analysing the runtime performance achieved on three different machines: a dual-processor 8-core Dell PowerEdge 2950 Beowulf cluster node (the reference machine), a quad-core hyper-threaded Intel Core-i7 based system equipped with an NVidia GTX-480 graphics accelerator, and an Oracle Sparc T4-4 server with a total of 256 hardware threads. We contrast our findings with those resulting from the reference C code and a few variants of it that employ OpenMP pragmas as well as explicit vectorisation. Our experiments demonstrate that the SaC implementation successfully combines a high level of abstraction, very close to the mathematical specification, with very competitive runtimes. In fact, SaC matches or outperforms the hand-vectorised and hand-parallelised C codes on all three systems under investigation without the need for any source code modification. Furthermore, only SaC is able to effectively harness the advanced compute power of the graphics accelerator, again by mere recompilation of the same source code. Our results illustrate the benefits that SaC provides to application programmers in terms of coding productivity, source code, and performance portability among different machine architectures, as well as long-term maintainability in evolving hardware environments. Copyright © 2013 John Wiley & Sons, Ltd.},
      category = {core},
      doi      = {10.1002/cpe.3078},
      url      = {2014_0.pdf},
    }
    This paper describes our experience in implementing the classical N-body algorithm in SaC and analysing the runtime performance achieved on three different machines: a dual-processor 8-core Dell PowerEdge 2950 Beowulf cluster node (the reference machine), a quad-core hyper-threaded Intel Core-i7 based system equipped with an NVidia GTX-480 graphics accelerator, and an Oracle Sparc T4-4 server with a total of 256 hardware threads. We contrast our findings with those resulting from the reference C code and a few variants of it that employ OpenMP pragmas as well as explicit vectorisation. Our experiments demonstrate that the SaC implementation successfully combines a high level of abstraction, very close to the mathematical specification, with very competitive runtimes. In fact, SaC matches or outperforms the hand-vectorised and hand-parallelised C codes on all three systems under investigation without the need for any source code modification. Furthermore, only SaC is able to effectively harness the advanced compute power of the graphics accelerator, again by mere recompilation of the same source code. Our results illustrate the benefits that SaC provides to application programmers in terms of coding productivity, source code, and performance portability among different machine architectures, as well as long-term maintainability in evolving hardware environments. Copyright © 2013 John Wiley & Sons, Ltd.
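    The all-pairs formulation that stays "very close to the mathematical specification" is essentially three data-parallel lines; a NumPy transcription (a sketch; the softening term eps is added here to sidestep the i = j singularity):

      import numpy as np

      def accelerations(pos, mass, eps=1e-9):
          # pos: (n, 3) positions, mass: (n,) masses
          d = pos[None, :, :] - pos[:, None, :]   # d[i, j] = pos[j] - pos[i]
          r2 = np.sum(d * d, axis=-1) + eps       # squared distances, softened
          inv_r3 = r2 ** -1.5
          # a[i] = sum_j mass[j] * d[i, j] / |d[i, j]|^3
          return np.einsum('ijk,ij,j->ik', d, inv_r3, mass)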
  • Next Generation Asynchronous Adaptive Specialization for Data-parallel Functional Array Processing in SaC C. Grelck, H. Wiesinger (2014). In Implementation and Application of Functional Languages (IFL'13), 25th International Symposium, Nijmegen, Netherlands. ACM. DOI 2014_1.pdf BibTeX Abstract
    @InProceedings{GrelWiesIFL13,
      author    = {C. Grelck and H. Wiesinger},
      title     = {Next Generation Asynchronous Adaptive Specialization for Data-parallel Functional Array Processing in Sac},
      booktitle = {Implementation and Application of Functional Languages (IFL'13), 25th International Symposium, Nijmegen, Netherlands},
      year      = {2014},
      editor    = {R. Plasmeijer and P. Achten and P. Koopman},
      publisher = {ACM},
      doi       = {10.1145/2620678.2620690},
      url       = {2014_1.pdf},
      abstract  = {Data-parallel processing of multi-dimensional functional/immutable arrays is characterized by a fundamental trade-off between software engineering principles on the one hand and runtime performance concerns on the other hand. Whereas the former demand code to be written in a generic style abstracting from structural properties of arrays as much as possible, the latter require an optimizing compiler to have as much information on the very same structural properties available at compile time. Asynchronous adaptive specialization of generic code to specific data to be processed at application runtime has proven to be an effective way to reconcile these contrarian demands. In this paper we revisit asynchronous adaptive specialization in the context of the functional data-parallel array language SaC. We provide a comprehensive analysis of its strengths and weaknesses and propose improvements for its design and implementation. These improvements are primarily concerned with making specializations available to running applications as quickly as possible. We propose four complementary measures to this effect. Bulk adaptive specialization speculatively waits for future specialization requests to materialize instead of addressing each request individually. Prioritized adaptive specialization aims at selecting the most profitable specializations first. Parallel adaptive specialization reserves multiple cores for specialization and, thus, computes multiple specializations simultaneously. Last but not least, persistent adaptive specialization preserves specializations across independent program runs and even across unrelated applications.},
      category  = {core},
    }
    Data-parallel processing of multi-dimensional functional/immutable arrays is characterized by a fundamental trade-off between software engineering principles on the one hand and runtime performance concerns on the other hand. Whereas the former demand code to be written in a generic style abstracting from structural properties of arrays as much as possible, the latter require an optimizing compiler to have as much information on the very same structural properties available at compile time. Asynchronous adaptive specialization of generic code to specific data to be processed at application runtime has proven to be an effective way to reconcile these contrarian demands. In this paper we revisit asynchronous adaptive specialization in the context of the functional data-parallel array language SaC. We provide a comprehensive analysis of its strengths and weaknesses and propose improvements for its design and implementation. These improvements are primarily concerned with making specializations available to running applications as quickly as possible. We propose four complementary measures to this effect. Bulk adaptive specialization speculatively waits for future specialization requests to materialize instead of addressing each request individually. Prioritized adaptive specialization aims at selecting the most profitable specializations first. Parallel adaptive specialization reserves multiple cores for specialization and, thus, computes multiple specializations simultaneously. Last but not least, persistent adaptive specialization preserves specializations across independent program runs and even across unrelated applications.
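    To make the mechanism concrete, the sketch below models in plain C the dispatch that asynchronous adaptive specialization revolves around. All names (the specialization cache, the request function, the shape key) are invented for illustration; SaC's actual runtime machinery differs.

      #include <stddef.h>

      /* Toy model of asynchronous adaptive specialization: calls go
       * through a cache of specializations that a background compiler
       * fills in over time; a miss runs the generic fallback and files
       * a non-blocking specialization request. */
      typedef void (*kernel_fn)(const double *in, double *out, size_t n);

      static void generic_kernel(const double *in, double *out, size_t n)
      {   /* shape-agnostic, therefore slower, code path */
          for (size_t i = 0; i < n; i++) out[i] = 2.0 * in[i];
      }

      #define MAX_SPECS 64
      static struct { size_t key; kernel_fn fn; } cache[MAX_SPECS];
      static size_t n_specs;                  /* grown by the specializer */

      static void request_specialization(size_t key)
      {   /* real system: enqueue for the decoupled compiler */
          (void)key;
      }

      void dispatch(const double *in, double *out, size_t n)
      {
          for (size_t i = 0; i < n_specs; i++)
              if (cache[i].key == n) { cache[i].fn(in, out, n); return; }
          request_specialization(n);          /* may pay off on a later call */
          generic_kernel(in, out, n);
      }

    Bulk, prioritized, parallel and persistent specialization, as proposed in the paper, all vary how and when the cache gets filled, not the dispatch itself.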

2013

  • Towards Heterogeneous Computing without Heterogeneous Programming M. Diogo, C. Grelck (2013). In Trends in Functional Programming, 13th Symposium, TFP 2012, St.Andrews, UK. pp. 279–294. Springer. DOI2013_0.pdf BibTexAbstract
    @InProceedings{DiogGrelTFP12,
      author      = {M. Diogo and C. Grelck},
      title       = {Towards Heterogeneous Computing without Heterogeneous Programming},
      booktitle   = {Trends in Functional Programming, 13th Symposium, TFP 2012, St.Andrews, UK},
      year        = {2013},
      editor      = {K. Hammond and H.W. Loidl},
      volume      = {7829},
      series      = {Lecture Notes in Computer Science},
      pages       = {279--294},
      publisher   = {Springer},
      summary     = {},
      abstract    = {From laptops to supercomputer nodes hardware architectures become increasingly heterogeneous, combining at least multiple general-purpose cores with one or even multiple GPU accelerators. Taking effective advantage of such systems’ capabilities becomes increasingly important, but is even more challenging. SaC is a functional array programming language with support for fully automatic parallelization following a data-parallel approach. Typical SaC programs are good matches for both conventional multi-core processors as well as many-core accelerators. Indeed, SaC supports both architectures in an entirely compiler-directed way, but so far a choice must be made at compile time: either the compiled code utilizes multiple cores and ignores a potentially available accelerator, or it uses a single GPU while ignoring all but one core of the host system. We present a compilation scheme and corresponding runtime system support that combine both code generation alternatives to harness the computational forces of multiple general-purpose cores and multiple GPU accelerators to collaboratively execute SaC programs without explicit encoding in the programs themselves and thus without going through the hassle of heterogeneous programming.},
      category    = {core, par},
      DOI         = {10.1007/978-3-642-40447-4_18},
      URL         = {2013_0.pdf},
    }
    From laptops to supercomputer nodes hardware architectures become increasingly heterogeneous, combining at least multiple general-purpose cores with one or even multiple GPU accelerators. Taking effective advantage of such systems’ capabilities becomes increasingly important, but is even more challenging. SaC is a functional array programming language with support for fully automatic parallelization following a data-parallel approach. Typical SaC programs are good matches for both conventional multi-core processors as well as many-core accelerators. Indeed, SaC supports both architectures in an entirely compiler-directed way, but so far a choice must be made at compile time: either the compiled code utilizes multiple cores and ignores a potentially available accelerator, or it uses a single GPU while ignoring all but one core of the host system. We present a compilation scheme and corresponding runtime system support that combine both code generation alternatives to harness the computational forces of multiple general-purpose cores and multiple GPU accelerators to collaboratively execute SaC programs without explicit encoding in the programs themselves and thus without going through the hassle of heterogeneous programming.
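    The collaborative execution described above hinges on splitting one data-parallel index space across execution units of very different speed. The C sketch below is one conceivable baseline for such a split, not the paper's actual runtime strategy; the throughput weights and names are assumptions of this illustration.

      #include <stddef.h>

      /* Static heterogeneous work distribution: divide n iterations into
       * contiguous slices proportional to each unit's measured relative
       * throughput (thru[u] > 0 assumed). Unit u gets [lo[u], hi[u]). */
      void partition(size_t n, size_t units, const double thru[],
                     size_t lo[], size_t hi[])
      {
          double total = 0.0;
          for (size_t u = 0; u < units; u++) total += thru[u];
          size_t start = 0;
          for (size_t u = 0; u < units; u++) {
              size_t len = (size_t)((double)n * thru[u] / total);
              if (u == units - 1) len = n - start;   /* absorb rounding */
              lo[u] = start;
              hi[u] = start + len;
              start += len;
          }
      }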

2012

  • Single Assignment C (sac): High Productivity Meets High Performance C. Grelck (2012). In 4th Central European Functional Programming Summer School (CEFP'11), Budapest, Hungary. pp. 207–278. Springer. DOI2012_1.pdf BibTexAbstract
    @InProceedings{GrelckCEFP11,
      author    = {C. Grelck},
      title     = {Single Assignment C (sac): High Productivity Meets High Performance},
      booktitle = {4th Central European Functional Programming Summer School (CEFP'11), Budapest, Hungary},
      year      = {2012},
      editor    = {V. Zs\'ok and Z. Horv\'ath and R. Plasmeijer},
      volume    = {7241},
      series    = {Lecture Notes in Computer Science},
      pages     = {207--278},
      publisher = {Springer},
      abstract  = {We present the ins and outs of the purely functional, data parallel programming language SaC (Single Assignment C). SaC defines state- and side-effect-free semantics on top of a syntax resembling that of imperative languages like C/C++/C# or Java: functional programming with curly brackets. In contrast to other functional languages data aggregation in SaC is not based on lists and trees, but puts stateless arrays into the focus. SaC implements an abstract calculus of truly multidimensional arrays that is adopted from interpreted array languages like Apl. Arrays are abstract values with certain structural properties. They are treated in a holistic way, not as loose collections of data cells or indexed memory address ranges. Programs can and should be written in a mostly index-free style. Functions consume array values as arguments and produce array values as results. The array type system of SaC allows such functions to abstract not only from the size of vectors or matrices but likewise from the number of array dimensions, supporting a highly generic programming style. The design of SaC aims at reconciling high productivity in software engineering of compute-intensive applications with high performance in program execution on modern multi- and many-core computing systems. While SaC competes with other functional and declarative languages on the productivity aspect, it competes with hand-parallelised C and Fortran code on the performance aspect. We achieve our goal through stringent co-design of programming language and compilation technology. The focus on arrays in general and the abstract view of arrays in particular combined with a functional state-free semantics are key ingredients in the design of SaC. In conjunction they allow for far-reaching program transformations and fully compiler-directed parallelisation. From literally the same source code SaC currently supports symmetric multi-socket, multi-core, hyperthreaded server systems, CUDA-enabled graphics accelerators and the MicroGrid, an innovative general-purpose many-core architecture. The CEFP lecture provides an introduction into the language design of SaC, followed by an illustration of how these concepts can be harnessed to write highly abstract, reusable and elegant code. We conclude with outlining the major compiler technologies for achieving runtime performance levels that are competitive with low-level machine-oriented programming environments.},
      category  = {design,core},
      doi       = {10.1007/978-3-642-32096-5_5},
      topics    = {SAC},
      url       = {2012_1.pdf},
    }
    We present the ins and outs of the purely functional, data parallel programming language SaC (Single Assignment C). SaC defines state- and side-effect-free semantics on top of a syntax resembling that of imperative languages like C/C++/C# or Java: functional programming with curly brackets. In contrast to other functional languages data aggregation in SaC is not based on lists and trees, but puts stateless arrays into the focus. SaC implements an abstract calculus of truly multidimensional arrays that is adopted from interpreted array languages like Apl. Arrays are abstract values with certain structural properties. They are treated in a holistic way, not as loose collections of data cells or indexed memory address ranges. Programs can and should be written in a mostly index-free style. Functions consume array values as arguments and produce array values as results. The array type system of SaC allows such functions to abstract not only from the size of vectors or matrices but likewise from the number of array dimensions, supporting a highly generic programming style. The design of SaC aims at reconciling high productivity in software engineering of compute-intensive applications with high performance in program execution on modern multi- and many-core computing systems. While SaC competes with other functional and declarative languages on the productivity aspect, it competes with hand-parallelised C and Fortran code on the performance aspect. We achieve our goal through stringent co-design of programming language and compilation technology. The focus on arrays in general and the abstract view of arrays in particular combined with a functional state-free semantics are key ingredients in the design of SaC. In conjunction they allow for far-reaching program transformations and fully compiler-directed parallelisation. From literally the same source code SaC currently supports symmetric multi-socket, multi-core, hyperthreaded server systems, CUDA-enabled graphics accelerators and the MicroGrid, an innovative general-purpose many-core architecture. The CEFP lecture provides an introduction into the language design of SaC, followed by an illustration of how these concepts can be harnessed to write highly abstract, reusable and elegant code. We conclude with outlining the major compiler technologies for achieving runtime performance levels that are competitive with low-level machine-oriented programming environments.
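    The "holistic" array view sketched in this abstract boils down, at runtime, to arrays that carry their own structure. A plausible C rendering of such a descriptor follows; it is illustrative only and not SaC's actual internal representation.

      #include <stddef.h>

      /* A multidimensional array as an abstract value: a descriptor
       * couples rank and shape with a flat, row-major data vector. */
      typedef struct {
          size_t  rank;     /* number of axes */
          size_t *shape;    /* extent per axis, rank entries */
          double *data;     /* flat element vector */
      } array_t;

      static size_t elem_count(const array_t *a)
      {
          size_t n = 1;
          for (size_t d = 0; d < a->rank; d++) n *= a->shape[d];
          return n;
      }

      /* Rank-generic, index-free element-wise addition: the same code
       * serves vectors, matrices and rank-7 tensors alike, provided the
       * two arguments have equal shapes. */
      void add(const array_t *a, const array_t *b, array_t *res)
      {
          size_t n = elem_count(a);
          for (size_t i = 0; i < n; i++)
              res->data[i] = a->data[i] + b->data[i];
      }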
  • Asynchronous Adaptive Optimisation for Generic Data-parallel Array Programming Clemens Grelck, Tim van Deurzen, Stephan Herhut, Sven-Bodo Scholz (2012). Concurrency and Computation: Practice and Experience 24 (5) pp. 499–516. DOI2012_2.pdf BibTexAbstract
    Programming productivity very much depends on the availability of basic building blocks that can be reused for a wide range of application scenarios and the ability to define rich abstraction hierarchies. Driven by the aim for increased reuse, such basic building blocks tend to become more and more generic in their specification; structural as well as behavioural properties are turned into parameters that are passed on to lower layers of abstraction where eventually a differentiation is being made. In the context of array programming, such properties are typically array ranks (number of axes/dimensions) and array shapes (number of elements along each axis/dimension). This allows for abstract definitions of operations such as element-wise additions, concatenations, rotations, and so on, which jointly enable a very high-level compositional style of programming, similar to, for instance, MATLAB. However, such a generic programming style generally comes at a price in terms of runtime overheads when compared against tailor-made low-level implementations. Additional layers of abstraction as well as the lack of hard-coded structural properties often inhibit optimisations that are obvious otherwise. Although complex static compiler analyses and transformations such as partial evaluations can ameliorate the situation to quite some extent, there are cases where the required level of information is not available until runtime. In this paper, we propose to shift part of the optimisation process into the runtime of applications. Triggered by some runtime observation, the compiler asynchronously applies partial evaluation techniques to frequently used program parts and dynamically replaces initial program fragments by more specialised ones through dynamic re-linking. In contrast to many existing approaches, we suggest this optimisation to be done in a rather non-intrusive, decoupled way. We use a full-fledged compiler that is run on a separate core. This measure enables us to run the compiler on its highest optimisation level, which requires non-negligible compilation times for our optimisations. We use the compiler's type system to identify the potential dynamic optimisations, and we use the host language's module system as a facilitator for the dynamic code modifications. We present the architecture and implementation of an adaptive compilation framework for Single Assignment C, a data-parallel array programming language. Single Assignment C advocates shape-generic and rank-generic programming with arrays. A sophisticated, highly optimising compiler technology nevertheless achieves competitive runtime performance. We demonstrate the suitability of our approach to achieve consistently high performance independent of the static availability of array properties by means of several experiments based on a highly generic formulation of rank-invariant convolution as a case study. Copyright © 2011 John Wiley & Sons, Ltd.
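    The "dynamic re-linking" step mentioned above has a well-known POSIX embodiment: loading a freshly compiled shared object and swapping a function pointer. The sketch below is a minimal illustration under that assumption; the file and symbol names are invented, and the program must be linked with -ldl on glibc systems.

      #include <dlfcn.h>

      /* Once the decoupled compiler has emitted a specialised shared
       * object, the running application loads it and redirects the
       * pointer it dispatches through; until then it keeps calling the
       * generic version. */
      typedef void (*kernel_fn)(const double *, double *, long);

      kernel_fn reload_kernel(kernel_fn current)
      {
          void *so = dlopen("./convolve_spec.so", RTLD_NOW | RTLD_LOCAL);
          if (so == NULL)
              return current;             /* not compiled yet: keep generic */
          kernel_fn fresh = (kernel_fn)dlsym(so, "convolve_rank2_spec");
          return fresh != NULL ? fresh : current;
      }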
  • User-defined Shape Constraints in Sac F. Tang, C. Grelck (2012). In 24th International Symposium on Implementation and Application of Functional Languages (IFL'12), Oxford, UK. University of Oxford. 2012_0.pdf BibTexAbstract
    @InProceedings{TangGrelIFL12,
      author    = {F. Tang and C. Grelck},
      title     = {User-defined Shape Constraints in Sac},
      booktitle = {24th International Symposium on Implementation and Application of Functional Languages (IFL'12), Oxford, UK},
      year      = {2012},
      editor    = {R. Hinze},
      publisher = {University of Oxford},
      abstract  = {We propose a method called user-defined constraints specifically for shape-generic multi-dimensional array programming. Our proposed technique allows programmers to make implicit constraints in the domain and codomain of functions explicit. This method can help compilers to generate more reliable code, improve performance through better optimization and improve software documentation. We propose and motivate a syntax extension for the functional array language SAC and describe steps to systematically transform source-level constraints into existing intermediate code representations. We discuss ways of statically resolving constraints through aggressive partial evaluation and propose some form of syntactic sugar that blurs the line between user-defined constraints and fully-fledged dependent types.},
      category  = {core},
      url       = {2012_0.pdf},
    }
    We propose a method called user-defined constraints specifically for shape-generic multi-dimensional array programming. Our proposed technique allows programmers to make implicit constraints in the domain and codomain of functions explicit. This method can help compilers to generate more reliable code, improve performance through better optimization and improve software documentation. We propose and motivate a syntax extension for the functional array language SAC and describe steps to systematically transform source-level constraints into existing intermediate code representations. We discuss ways of statically resolving constraints through aggressive partial evaluation and propose some form of syntactic sugar that blurs the line between user-defined constraints and fully-fledged dependent types.
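    As a concrete instance of an implicit domain constraint that the paper proposes to make explicit: a matrix product only exists when the inner extents agree. In plain C such a constraint can only live as a runtime guard, as in the sketch below (names are illustrative); the point of source-level constraints is that aggressive partial evaluation can often discharge such guards at compile time.

      #include <assert.h>
      #include <stddef.h>

      /* matmul: a is m x k, b is k2 x n, c is m x n (all row-major).
       * The explicit constraint k == k2 documents the function's domain
       * and guards against out-of-bounds access. */
      void matmul(size_t m, size_t k, size_t k2, size_t n,
                  const double *a, const double *b, double *c)
      {
          assert(k == k2 && "domain constraint: inner extents must agree");
          for (size_t i = 0; i < m; i++)
              for (size_t j = 0; j < n; j++) {
                  double s = 0.0;
                  for (size_t l = 0; l < k; l++)
                      s += a[i*k + l] * b[l*n + j];
                  c[i*n + j] = s;
              }
      }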
  • Supporting Heterogeneous Computing Environments in Sac M. Diogo, C. Grelck (2012). In 13th Symposium on Trends in Functional Programming (TFP'12) St.Andrews, UK. University of St.Andrews. BibTex
    @InProceedings{DiogGrelTFP12draft,
      author    = {M. Diogo and C. Grelck},
      title     = {Supporting Heterogeneous Computing Environments in Sac},
      booktitle = {13th Symposium on Trends in Functional Programming (TFP'12) St.Andrews, UK},
      year      = {2012},
      editor    = {K. Hammond and H.W. Loidl},
      publisher = {University of St.Andrews},
    }
  • SaC on a Niagara T3-4 Server: Lessons and Experiences C. Grelck, R. Douma (2012). In Applications, Tools and Techniques on the Road to Exascale Computing. pp. 289–296. IOS Press, Amsterdam. DOI BibTexAbstract
    @InCollection{GrelDoumParCo11,
      author    = {C. Grelck and R. Douma},
      title     = {SaC on a Niagara T3-4 Server: Lessons and Experiences},
      booktitle = {Applications, Tools and Techniques on the Road to Exascale Computing},
      publisher = {IOS Press, Amsterdam},
      year      = {2012},
      editor    = {K. {de Bosschere} and E.H. {D'Hollander} and G.R. Joubert and D. Padua and F. Peters and M. Sawyer},
      volume    = {22},
      series    = {Advances in Parallel Computing},
      pages     = {289--296},
      abstract  = {The Sparc T3-4 server provides up to 512 concurrent hardware threads, a degree of concurrency that is unprecedented in a single server system. This paper reports on how the automatically parallelising compiler of the data-parallel functional array language SAC copes with up to 512 execution units. We investigate three different numerical kernels that are representative for a wide range of applications: matrix multiplication, convolution and 3-dimensional FFT. We show both the high-level declarative coding style of SAC and the performance achieved on the T3-4 server. Last but not least, we draw conclusions for improving our compiler technology in the future.},
      category  = {core},
      doi       = {10.3233/978-1-61499-041-3-289},
      topics    = {SAC},
    }
    The Sparc T3-4 server provides up to 512 concurrent hardware threads, a degree of concurrency that is unprecedented in a single server system. This paper reports on how the automatically parallelising compiler of the data-parallel functional array language SAC copes with up to 512 execution units. We investigate three different numerical kernels that are representative for a wide range of applications: matrix multiplication, convolution and 3-dimensional FFT. We show both the high-level declarative coding style of SAC and the performance achieved on the T3-4 server. Last but not least, we draw conclusions for improving our compiler technology in the future.
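    Of the three kernels, convolution shows the data-parallel structure most plainly. Below is a minimal C version of a 1-dimensional variant (boundaries clipped); it is illustrative only, not the benchmarked SAC code.

      #include <stddef.h>

      /* 1-d convolution of signal x (length n) with kernel w (length k),
       * valid region only. Every output element is independent, which is
       * what lets a data-parallel compiler spread the outer loop across
       * hundreds of hardware threads. */
      void convolve(size_t n, const double x[], size_t k,
                    const double w[], double y[])
      {
          for (size_t i = 0; i + k <= n; i++) {
              double s = 0.0;
              for (size_t j = 0; j < k; j++)
                  s += w[j] * x[i + j];
              y[i] = s;
          }
      }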
  • Combining High Productivity and High Performance in Image Processing Using Single Assignment C on Multi-core CPUs and Many-core GPUs V. Wieser, C. Grelck, P. Haslinger, J. Guo, F. Korzeniowski, R. Bernecky, B. Moser, S.B. Scholz (2012). Journal of Electronic Imaging 21 (2) DOIwiesgrelhasl_jei12.pdf BibTexAbstract
    @article{WiesGrelHaslJEI12,
      author = {V. Wieser and C. Grelck and P. Haslinger and J. Guo and
                   F. Korzeniowski and R. Bernecky and B. Moser and S.B. Scholz},
      title = {Combining High Productivity and High Performance in Image
                   Processing Using {Single Assignment C} on Multi-core {CPUs}
                   and Many-core {GPUs}},
      journal = {Journal of Electronic Imaging},
      year = {2012},
      note = {},
      volume = {21},
      number = {2},
      pages = {},
      abstract = {In this paper the challenge of parallelization development of industrial high performance inspection systems is addressed concerning a conventional parallelization approach versus an auto-parallelized technique. Therefore, we introduce the functional array processing language Single Assignment C (SaC), which relies on a hardware virtualization concept for automated, parallel machine code generation for multicore CPUs and GPUs. Additionally, software engineering aspects like programmability, productivity, understandability, maintainability and the resulting gain in performance are discussed from the point of view of a developer. With several illustrative benchmarking examples from the field of image processing and machine learning, the relationship between runtime performance and efficiency of development is analyzed.},
      topics = {SAC},
      doi = {10.1117/1.JEI.21.2.021116},
      url = {wiesgrelhasl_jei12.pdf},
      category = {apps}
    }
    In this paper the challenge of parallelization development of industrial high performance inspection systems is addressed concerning a conventional parallelization approach versus an auto-parallelized technique. Therefore, we introduce the functional array processing language Single Assignment C (SaC), which relies on a hardware virtualization concept for automated, parallel machine code generation for multicore CPUs and GPUs. Additionally, software engineering aspects like programmability, productivity, understandability, maintainability and the resulting gain in performance are discussed from the point of view of a developer. With several illustrative benchmarking examples from the field of image processing and machine learning, the relationship between runtime performance and efficiency of development is analyzed.

2011

  • Nested Arrays in Single Assignment C R. Douma (2011). Amsterdam, Netherlands. 2011_0.pdf BibTexAbstract
    @MastersThesis{Douma11,
      author    = {R. Douma},
      title     = {Nested Arrays in Single Assignment C},
      school    = {University of Amsterdam},
      year      = {2011},
      address   = {Amsterdam, Netherlands},
      topics    = {SAC},
      summary   = {},
      abstract  = {In many languages when one talks about arrays this means rectangular arrays. However, there are many problems that are not rectangular and it is often illogical to describe them using rectangular arrays. We will call these non-rectangular arrays irregular arrays. In this thesis we introduce an implementation for irregular arrays, called nested arrays, in Single Assignment C (or SaC for short). We discuss the design space of irregular arrays and extend the SaC language and the SaC compiler to support irregular arrays. As a result we show that for SaC programs that have irregular data the use of nested arrays can lead to a significant reduction in memory requirements, while at the same time providing substantial speedup.},
      category  = {core, design},
      DOI       = {},
      URL       = {2011_0.pdf},
    }
    In many languages when one talks about arrays this means rectangular arrays. However, there are many problems that are not rectangular and it is often illogical to describe them using rectangular arrays. We will call these non-rectangular arrays irregular arrays. In this thesis we introduce an implementation for irregular arrays, called nested arrays, in Single Assignment C (or SaC for short). We discuss the design space of irregular arrays and extend the SaC language and the SaC compiler to support irregular arrays. As a result we show that for SaC programs that have irregular data the use of nested arrays can lead to a significant reduction in memory requirements, while at the same time providing substantial speedup.
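    A common representation choice for such irregular arrays, shown here as a C sketch, is to store all rows back to back and keep one offset per row, so that memory grows with the data rather than with the bounding rectangle. This is illustrative only; the thesis explores its own design space.

      #include <stddef.h>

      /* Ragged 2-d array: row r occupies data[offset[r]] up to (but not
       * including) data[offset[r+1]]; offset has rows+1 entries. */
      typedef struct {
          size_t  rows;
          size_t *offset;
          double *data;
      } nested_t;

      double row_sum(const nested_t *a, size_t r)
      {
          double s = 0.0;
          for (size_t i = a->offset[r]; i < a->offset[r + 1]; i++)
              s += a->data[i];
          return s;
      }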
  • Combining High Productivity and High Performance in Image Processing Using Single Assignment C Volkmar Wieser, Bernhard Moser, Sven-Bodo Scholz, Stephan Herhut, Jing Guo (2011). In 10th International Conference on Quality Control by Artificial Vision (QCAV'11), Saint Etienne, France. DOI2011_3.pdf BibTexAbstract
    @InProceedings{WiesMoseSchoetalQCAV11,
      author    = {Volkmar Wieser and Bernhard Moser and Sven-Bodo Scholz and Stephan Herhut and Jing Guo},
      title     = {Combining High Productivity and High Performance in Image Processing Using Single Assignment C},
      booktitle = {10th International Conference on Quality Control by Artificial Vision (QCAV'11), Saint Etienne, France},
      year      = {2011},
      affil     = {ctca},
      summary   = {},
      abstract  = {In this paper the problem of high performance software engineering is addressed in the context of image processing regarding productivity and optimized exploitation of hardware resources. Therefore, we introduce the functional array processing language Single Assignment C (SaC), which relies on a hardware virtualization concept for automated, parallel machine code generation. An illustrative benchmarking example proves both utility and adequacy of SaC for image processing.},
      category  = {apps},
      DOI       = {10.1117/12.890920},
      URL       = {2011_3.pdf},
    }
    In this paper the problem of high performance software engineering is addressed in the context of image processing regarding productivity and optimized exploitation of hardware resources. Therefore, we introduce the functional array processing language Single Assignment C (SaC), which relies on a hardware virtualization concept for automated, parallel machine code generation. An illustrative benchmarking example proves both utility and adequacy of SaC for image processing.
  • Harnessing the Power of Gpus without Losing Abstractions in Sac and Arrayol: A Comparative Study Jing Guo, Antonio Wendell O Rodrigues, Jeyarajan Thiyagalingam, Frederic Guyomarc'h, Pierre Boulet, Sven-Bodo Scholz (2011). In 16th International Workshop on High-Level Parallel Programming Models and Supportive Environments (HIPS'11), Anchorage, Alaska, USA. IEEE Xplore. DOI2011_4.pdf BibTexAbstract
    @InProceedings{GuoWendJeyaetalHIPS11,
      author    = {Jing Guo and Antonio Wendell O Rodrigues and Jeyarajan Thiyagalingam and Frederic Guyomarc'h and Pierre Boulet and Sven-Bodo Scholz},
      title     = {Harnessing the Power of Gpus without Losing Abstractions in Sac and Arrayol: A Comparative Study},
      booktitle = {16th International Workshop on High-Level Parallel Programming Models and Supportive Environments (HIPS'11), Anchorage, Alaska, USA},
      year      = {2011},
      editor    = {Thorsten Hoefler},
      publisher = {IEEE Xplore},
      abstract  = {Over recent years, using Graphics Processing Units (GPUs) has become an effective method for increasing the performance of many applications. However, these performance benefits from GPUs come at a price. Firstly, extensive programming expertise and intimate knowledge of the underlying hardware are essential for gaining good speedups. Secondly, the expressibility of GPU-based programs is not powerful enough to retain the high-level abstractions of the solutions. Although the programming experience has been significantly improved by existing frameworks like CUDA and OpenCL, it is still a challenge to effectively utilise these devices while still retaining the programming abstractions. To this end, performing a source-to-source transformation, whereby a high-level language is mapped to CUDA or OpenCL, is an attractive option. In particular, it enables one to retain high-level abstractions and to harness the power of GPUs without any expertise in GPGPU programming. In this paper, we compare and analyse two such schemes. One of them is a transformation mechanism for mapping an image/signal processing domain-specific language, ArrayOL, to OpenCL. The other one is a transformation route for mapping a high-level general purpose array processing language, Single Assignment C (SaC), to CUDA. Using a real-world image processing application as a running example, we demonstrate that, despite being general purpose, the array processing language can be used to specify complex array access patterns generically. Performance of the generated CUDA code is comparable to the OpenCL code created from the domain-specific language.},
      affil     = {ctca},
      category  = {apps,design},
      doi       = {10.1109/IPDPS.2011.276},
      topics    = {SAC},
      url       = {2011_4.pdf},
    }
    Over recent years, using Graphics Processing Units (GPUs) has become an effective method for increasing the performance of many applications. However, these performance benefits from GPUs come at a price. Firstly, extensive programming expertise and intimate knowledge of the underlying hardware are essential for gaining good speedups. Secondly, the expressibility of GPU-based programs is not powerful enough to retain the high-level abstractions of the solutions. Although the programming experience has been significantly improved by existing frameworks like CUDA and OpenCL, it is still a challenge to effectively utilise these devices while still retaining the programming abstractions. To this end, performing a source-to-source transformation, whereby a high-level language is mapped to CUDA or OpenCL, is an attractive option. In particular, it enables one to retain high-level abstractions and to harness the power of GPUs without any expertise in GPGPU programming. In this paper, we compare and analyse two such schemes. One of them is a transformation mechanism for mapping an image/signal processing domain-specific language, ArrayOL, to OpenCL. The other one is a transformation route for mapping a high-level general purpose array processing language, Single Assignment C (SaC), to CUDA. Using a real-world image processing application as a running example, we demonstrate that, despite being general purpose, the array processing language can be used to specify complex array access patterns generically. Performance of the generated CUDA code is comparable to the OpenCL code created from the domain-specific language.
  • Breaking the Gpu Programming Barrier with the Auto-parallelising Sac Compiler Jing Guo, Jeyarajan Thiyagalingam, Sven-Bodo Scholz (2011). In 6th Workshop on Declarative Aspects of Multicore Programming (DAMP'11), Austin, USA. pp. 15–24. ACM Press. DOI2011_5.pdf BibTexAbstract
    @InProceedings{GuoJeyaSchoDAMP11,
      author     = {Jing Guo and Jeyarajan Thiyagalingam and Sven-Bodo Scholz},
      title      = {Breaking the Gpu Programming Barrier with the Auto-parallelising Sac Compiler},
      booktitle  = {6th Workshop on Declarative Aspects of Multicore Programming (DAMP'11), Austin, USA},
      year       = {2011},
      pages      = {15--24},
      publisher  = {ACM Press},
      abstract   = {Over recent years, the use of Graphics Processing Units (GPUs) for general-purpose computing has become increasingly popular. The main reasons for this development are the attractive performance/price and performance/power ratios of these architectures. However, substantial performance gains from GPUs come at a price: they require extensive programming expertise and, typically, a substantial re-coding effort. Although the programming experience has been significantly improved by existing frameworks like CUDA and OpenCL, it is still a challenge to effectively utilise these devices. Directive-based approaches such as hiCUDA or OpenMP-variants offer further improvements but have not eliminated the need for the expertise on these complex architectures. Similarly, special purpose programming languages such as Microsoft's Accelerator try to lower the barrier further. They provide the programmer with a special form of GPU data structures and operations on them which are then compiled into GPU code. In this paper, we take this trend towards a completely implicit, high-level approach yet another step further. We generate CUDA code from a MATLAB-like high level functional array programming language, Single Assignment C (SaC). To do so, we identify which data structures and operations can be successfully mapped on GPUs and transform existing programs accordingly. This paper presents the first runtime results from our GPU backend and it presents the basic set of GPU-specific program optimisations that turned out to be essential. Despite our high-level program specifications, we show that for a number of benchmarks speedups between a factor of 5 and 50 can be achieved through our parallelising compiler.},
      affil      = {ctca},
      category   = {par,core,opt},
      doi        = {10.1145/1926354.1926359},
      pubaddress = {New York, NY, USA},
      url        = {2011_5.pdf},
    }
    Over recent years, the use of Graphics Processing Units (GPUs) for general-purpose computing has become increasingly popular. The main reasons for this development are the attractive performance/price and performance/power ratios of these architectures. However, substantial performance gains from GPUs come at a price: they require extensive programming expertise and, typically, a substantial re-coding effort. Although the programming experience has been significantly improved by existing frameworks like CUDA and OpenCL, it is still a challenge to effectively utilise these devices. Directive-based approaches such as hiCUDA or OpenMP-variants offer further improvements but have not eliminated the need for the expertise on these complex architectures. Similarly, special purpose programming languages such as Microsoft's Accelerator try to lower the barrier further. They provide the programmer with a special form of GPU data structures and operations on them which are then compiled into GPU code. In this paper, we take this trend towards a completely implicit, high-level approach yet another step further. We generate CUDA code from a MATLAB-like high level functional array programming language, Single Assignment C (SaC). To do so, we identify which data structures and operations can be successfully mapped on GPUs and transform existing programs accordingly. This paper presents the first runtime results from our GPU backend and it presents the basic set of GPU-specific program optimisations that turned out to be essential. Despite our high-level program specifications, we show that for a number of benchmarks speedups between a factor of 5 and 50 can be achieved through our parallelising compiler.
  • Using Openmp As an Alternative Parallelization Strategy in Sac Zheng Zhangzheng (2011). Amsterdam, Netherlands. 2011_6.pdf BibTex
    @MastersThesis{Zhangzheng11,
      author  = {Zheng Zhangzheng},
      title   = {Using Openmp As an Alternative Parallelization Strategy in Sac},
      school  = {University of Amsterdam},
      year    = {2011},
      category = {par, core},
      address = {Amsterdam, Netherlands},
      url     = {2011_6.pdf},
    }
  • Concurrent Non-deferred Reference Counting on the Microgrid: First Experiences S. Herhut, C. Joslin, S.B. Scholz, R. Poss, C. Grelck (2011). In 22nd International Symposium on Implementation and Application of Functional Languages (IFL'10), Alphen a/d Rijn, Netherlands, Revised Selected Papers. Springer. DOI2011_2.pdf BibTexAbstract
    @InProceedings{HerhJoslSchoetalIFL10,
      author     = {S. Herhut and C. Joslin and S.B. Scholz and R. Poss and C. Grelck},
      title      = {Concurrent Non-deferred Reference Counting on the Microgrid: First Experiences},
      booktitle  = {22nd International Symposium on Implementation and Application of Functional Languages (IFL'10), Alphen a/d Rijn, Netherlands, Revised Selected Papers},
      year       = {2011},
      editor     = {J. Haage and M. Moraz\'an},
      volume     = {6647},
      series     = {Lecture Notes in Computer Science},
      publisher  = {Springer},
      abstract   = {We present a first evaluation of our novel approach for non-deferred reference counting on the Microgrid many-core architecture. Non-deferred reference counting is a fundamental building block of implicit heap management of functional array languages in general and Single Assignment C in particular. Existing lock-free approaches for multi-core and SMP settings do not scale well for large numbers of cores in emerging many-core platforms. We, instead, employ a dedicated core for reference counting and use asynchronous messaging to emit reference counting operations. This novel approach decouples computational workload from reference-counting overhead. Experiments using cycle-accurate simulation of a realistic Microgrid show that, by exploiting asynchronism, we are able to tolerate even worst-case reference counting loads reasonably well. Scalability is essentially limited only by the combined sequential runtime of all reference counting operations, in accordance with Amdahl’s law. Even though developed in the context of Single Assignment C and the Microgrid, our approach is applicable to a wide range of languages and platforms.},
      affil      = {ctca},
      category   = {par},
      doi        = {10.1007/978-3-642-24276-2_12},
      pubaddress = {Berlin, Heidelberg, Germany},
      topics     = {SAC},
      url        = {2011_2.pdf},
    }
    We present a first evaluation of our novel approach for non-deferred reference counting on the Microgrid many-core architecture. Non-deferred reference counting is a fundamental building block of implicit heap management of functional array languages in general and Single Assignment C in particular. Existing lock-free approaches for multi-core and SMP settings do not scale well for large numbers of cores in emerging many-core platforms. We, instead, employ a dedicated core for reference counting and use asynchronous messaging to emit reference counting operations. This novel approach decouples computational workload from reference-counting overhead. Experiments using cycle-accurate simulation of a realistic Microgrid show that, by exploiting asynchronism, we are able to tolerate even worst-case reference counting loads reasonably well. Scalability is essentially limited only by the combined sequential runtime of all reference counting operations, in accordance with Amdahl’s law. Even though developed in the context of Single Assignment C and the Microgrid, our approach is applicable to a wide range of languages and platforms.
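    The decoupling idea translates to commodity hardware roughly as follows; the sketch is a pthreads caricature (the Microgrid uses hardware-level asynchronous messaging, and every name below is invented for illustration). Workers post reference-count deltas to a queue, and one dedicated thread applies them, so the counters themselves need neither locks nor atomics.

      #include <pthread.h>
      #include <stddef.h>

      /* rc_post() is called by any worker thread; rc_core() runs on the
       * dedicated core and is the only mutator of the counters. Queue
       * overflow handling is omitted to keep the sketch short. */
      typedef struct { int *counter; int delta; } rc_msg;

      #define QCAP 1024
      static rc_msg q[QCAP];
      static size_t q_head, q_tail;
      static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
      static pthread_cond_t  nonempty = PTHREAD_COND_INITIALIZER;

      void rc_post(int *counter, int delta)
      {
          pthread_mutex_lock(&mtx);
          q[q_tail++ % QCAP] = (rc_msg){ counter, delta };
          pthread_cond_signal(&nonempty);
          pthread_mutex_unlock(&mtx);
      }

      void *rc_core(void *arg)
      {
          (void)arg;
          for (;;) {
              pthread_mutex_lock(&mtx);
              while (q_head == q_tail)
                  pthread_cond_wait(&nonempty, &mtx);
              rc_msg m = q[q_head++ % QCAP];
              pthread_mutex_unlock(&mtx);
              *m.counter += m.delta;   /* no atomics: single mutator */
              /* if *m.counter == 0: the associated array can be freed */
          }
      }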

2010

  • Compilation of Modelica Array Computations into Single Assignment C for Efficient Execution on Cuda-enabled Gpu Kristian Stavåker, Daniel Rolls, Jing Guo, Peter Fritzson, Sven-Bodo Scholz (oct 2010). In 3rd International Workshop on Equation-Based Object-Oriented Languages and Tools, Oslo, Norway. 2010_2.pdf BibTexAbstract
    @InProceedings{RollSchoJoslStavetalEOOLT10,
      author    = {Kristian Stavåker and Daniel Rolls and Jing Guo and Peter Fritzson and Sven-Bodo Scholz},
      title     = {Compilation of Modelica Array Computations into Single Assignment C for Efficient Execution on Cuda-enabled Gpu},
      booktitle = {3rd International Workshop on Equation-Based Object-Oriented Languages and Tools, Oslo, Norway},
      year      = {2010},
      month     = oct,
      abstract  = {Mathematical models, derived for example from discretisation of partial differential equations, often contain operations over large arrays. In this work we investigate the possibility of compiling array operations from models in the equation-based language Modelica into Single Assignment C (SAC). The SAC compiler SAC2C can generate highly efficient code that, for instance, can be executed on CUDA-enabled GPUs. We plan to enhance the open-source Modelica compiler OpenModelica with capabilities to detect and compile data parallel Modelica for-equations/array-equations into SAC WITH-loops. As a first step we demonstrate the feasibility of this approach by manually inserting calls to SAC array operations in the code generated from OpenModelica and show how capabilities and runtimes can be extended. As a second step we demonstrate the feasibility of rewriting parts of the OpenModelica simulation runtime system in SAC. Finally, we discuss SAC2C's switchable target architectures and demonstrate one by harnessing a CUDA-enabled GPU to improve runtimes. To the best of our knowledge, compilation of Modelica array operations for execution on CUDA-enabled GPUs is a new research area.},
      affil     = {ctca},
      category  = {core,opt},
      topics    = {SAC, cuda, Modelica, Openmodelica},
      url       = {2010_2.pdf},
    }
    Mathematical models, derived for example from discretisation of partial differential equations, often contain operations over large arrays. In this work we investigate the possibility of compiling array operations from models in the equation-based language Modelica into Single Assignment C (SAC). The SAC compiler SAC2C can generate highly efficient code that, for instance, can be executed on CUDA-enabled GPUs. We plan to enhance the open-source Modelica compiler OpenModelica with capabilities to detect and compile data parallel Modelica for-equations/array-equations into SAC WITH-loops. As a first step we demonstrate the feasibility of this approach by manually inserting calls to SAC array operations in the code generated from OpenModelica and show how capabilities and runtimes can be extended. As a second step we demonstrate the feasibility of rewriting parts of the OpenModelica simulation runtime system in SAC. Finally, we discuss SAC2C's switchable target architectures and demonstrate one by harnessing a CUDA-enabled GPU to improve runtimes. To the best of our knowledge, compilation of Modelica array operations for execution on CUDA-enabled GPUs is a new research area.
  • Symbiotic Expressions Robert Bernecky, Stephan Herhut, Sven-Bodo Scholz (2010). In Implementation and Application of Functional Languages, 21st International Symposium, IFL 2009, South Orange, NJ, USA. pp. 107–126. Springer. DOI2011_7.pdf BibTexAbstract
    @InProceedings{BernHerhSchoIFL09,
      author    = {Robert Bernecky and Stephan Herhut and Sven-Bodo Scholz},
      title     = {Symbiotic Expressions},
      booktitle = {Implementation and Application of Functional Languages, 21st International Symposium, IFL 2009, South Orange, NJ, USA},
      year      = {2010},
      editor    = {Marco T. Morazan and Sven-Bodo Scholz},
      number    = {6041},
      series    = {Lecture Notes in Computer Science},
      pages     = {107--126},
      publisher = {Springer},
      affil     = {ctca},
      summary   = {},
      abstract  = {We introduce symbiotic expressions, a method for algebraic simplification within a compiler, in lieu of an SMT solver, such as Yices or the Omega Calculator. Symbiotic expressions are compiler-generated expressions, temporarily injected into a program's abstract syntax tree (AST). The compiler's normal optimizations interpret and simplify those expressions, making their results available for the compiler to use as a basis for decisions about further optimization of the source program. The expressions are symbiotic, in the sense that both parties benefit: an optimization benefits, by using the compiler itself to simplify expressions that have been attached, lamprey-like, to the AST by the optimization; the program being compiled benefits, from improved run-time in both serial and parallel environments. We show the utility of symbiotic expressions by using them to extend the SAC compiler's With-Loop-Folding optimization, currently limited to Arrays of Known Shape (AKS), to Arrays of Known Dimensionality (AKD). We show that, in conjunction with array-based constant-folding, injection and propagation of array extrema, and compiler-based expression simplification, symbiotic expressions are an effective tool for implementing advanced array optimizations. Symbiotic expressions are also simpler and more likely to be correct than hard-coded analysis, and are flexible and relatively easy to use. Finally, symbiotic expressions are synergistic: they take immediate advantage of new or improved optimizations in the compiler. Symbiotic expressions are a useful addition to a compiler writer's toolkit, giving the compiler a restricted subset of the analysis power of an SMT solver.},
      category  = {opt},
      DOI       = {10.1007/978-3-642-16478-1_7},
      URL       = {2011_7.pdf},
    }
    We introduce symbiotic expressions, a method for algebraic simplification within a compiler, in lieu of an SMT solver, such as Yices or the Omega Calculator. Symbiotic expressions are compiler-generated expressions, temporarily injected into a program's abstract syntax tree (AST). The compiler's normal optimizations interpret and simplify those expressions, making their results available for the compiler to use as a basis for decisions about further optimization of the source program. The expressions are symbiotic, in the sense that both parties benefit: an optimization benefits, by using the compiler itself to simplify expressions that have been attached, lamprey-like, to the AST by the optimization; the program being compiled benefits, from improved run-time in both serial and parallel environments. We show the utility of symbiotic expressions by using them to extend the SAC compiler's With-Loop-Folding optimization, currently limited to Arrays of Known Shape (AKS), to Arrays of Known Dimensionality (AKD). We show that, in conjunction with array-based constant-folding, injection and propagation of array extrema, and compiler-based expression simplification, symbiotic expressions are an effective tool for implementing advanced array optimizations. Symbiotic expressions are also simpler and more likely to be correct than hard-coded analysis, and are flexible and relatively easy to use. Finally, symbiotic expressions are synergistic: they take immediate advantage of new or improved optimizations in the compiler. Symbiotic expressions are a useful addition to a compiler writer's toolkit, giving the compiler a restricted subset of the analysis power of an SMT solver.
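    The mechanism shrinks nicely to a toy: the optimisation encodes its question as an ordinary expression tree, and the compiler's existing constant folder answers it. The C sketch below assumes a two-operator AST that is, of course, a stand-in for the compiler's real one.

      #include <stdbool.h>

      /* Minimal expression AST: integer constants and '<'. fold() plays
       * the role of the compiler's constant folder; an injected query is
       * acted upon only if it folds to a definite value. */
      typedef enum { E_CONST, E_LESS } tag_t;
      typedef struct expr { tag_t tag; long val; const struct expr *l, *r; } expr;

      static bool fold(const expr *e, long *out)
      {
          long a, b;
          switch (e->tag) {
          case E_CONST: *out = e->val; return true;
          case E_LESS:  if (fold(e->l, &a) && fold(e->r, &b)) {
                            *out = (a < b);
                            return true;
                        }
                        return false;
          }
          return false;
      }

      /* E.g. the optimisation injects "extent_of_a < extent_of_b"; once
       * shape propagation has turned both sides into E_CONST nodes, the
       * query is decided and the rewrite may fire. Anything unresolved
       * simply leaves the program unchanged. */
      bool provably_true(const expr *query)
      {
          long v;
          return fold(query, &v) && v != 0;
      }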
  • Compiler-support for Robust Multi-core Computing Raimund Kirner, Stephan Herhut, Sven-Bodo Scholz (2010). In 4th International Symposium on Leveraging Applications of Formal Methods, Verification and Validation. Springer Verlag. DOI2010_1.pdf BibTexAbstract
    @InProceedings{KirnHerhScho10,
      author     = {Raimund Kirner and Stephan Herhut and Sven-Bodo Scholz},
      title      = {Compiler-support for Robust Multi-core Computing},
      booktitle  = {4th International Symposium on Leveraging Applications of Formal Methods, Verification and Validation},
      year       = {2010},
      publisher  = {Springer Verlag},
      abstract   = {Embedded computing is characterised by the limited availability of computing resources. Further, embedded systems are often used in safety-critical applications with real-time constraints. Thus, the software development has to follow rigorous procedures to minimise the risk of system failures. However, besides the inherent application complexities, there is also an increased technology-based complexity due to the shift to concurrent programming of multi-core systems. For such systems it is quite challenging to develop safe and resource-efficient systems. In this paper we give a plea for the need of better software development tools to cope with this challenge. For example, we outline how compilers can help to simplify the writing of fault-tolerant and robust software, which keeps the application code more compact, comprehensible, and maintainable. We take a rather extreme stand by promoting a functional programming approach. This functional programming paradigm reduces the complexity of program analysis and thus allows for more efficient and powerful techniques. We will implement an almost transparent support for robustness within the SaC research compiler, which accepts a C-like functional program as input. Compared to conventional approaches in the field of automatic software-controlled resilience, our functional setting will allow for lower overhead, making the approach interesting for embedded computing as well as for high-performance computing.},
      affil      = {ctca},
      category   = {core,par},
      doi        = {10.1007/978-3-642-16558-0_6},
      location   = {Amirandes, Crete},
      pubaddress = {Berlin, Heidelberg, Germany},
      topics     = {SAC, robustness, compiler-based robustness, fault-tolerance},
      url        = {2010_1.pdf},
    }
    Embedded computing is characterised by the limited availability of computing resources. Further, embedded systems are often used in safety-critical applications with real-time constraints. Thus, the software development has to follow rigorous procedures to minimise the risk of system failures. However, besides the inherent application complexities, there is also an increased technology-based complexity due to the shift to concurrent programming of multi-core systems. For such systems it is quite challenging to develop safe and resource-efficient systems. In this paper we give a plea for the need of better software development tools to cope with this challenge. For example, we outline how compilers can help to simplify the writing of fault-tolerant and robust software, which keeps the application code more compact, comprehensible, and maintainable. We take a rather extreme stand by promoting a functional programming approach. This functional programming paradigm reduces the complexity of program analysis and thus allows for more efficient and powerful techniques. We will implement an almost transparent support for robustness within the SaC research compiler, which accepts a C-like functional program as input. Compared to conventional approaches in the field of automatic software-controlled resilience, our functional setting will allow for lower overhead, making the approach interesting for embedded computing as well as for high-performance computing.
  • An Adaptive Compilation Framework for Generic Data-parallel Array Programming Clemens Grelck, Tim van Deurzen, Stephan Herhut, Sven-Bodo Scholz (2010). In 15th Workshop on Compilers for Parallel Computing (CPC'10). Vienna University of Technology, Vienna, Austria. 2010_3.pdf BibTexAbstract
    @InProceedings{GrelDeurHerhetalCPC10,
      author    = {Clemens Grelck and Tim van Deurzen and Stephan Herhut and Sven-Bodo Scholz},
      title     = {An Adaptive Compilation Framework for Generic Data-parallel Array Programming},
      booktitle = {15th Workshop on Compilers for Parallel Computing (CPC'10)},
      year      = {2010},
      publisher = {Vienna University of Technology, Vienna, Austria},
      abstract  = {Generic array programming abstracts from structural properties of arrays, such as rank (number of axes/dimensions) and shape (number of elements along each axis/dimension). This allows for abstract program specifications and, as such, is desirable from a software engineering perspective. However, generic programming in this sense does have an adverse effect on runtime performance, at least when executed naively. Static compiler analyses and transformations aim at reconciling software engineering desires for generic code with runtime performance requirements. However, they are bound to fail whenever the required information is not available until runtime. We propose a compilation framework that overcomes the inherent limitations of static analysis by incrementally adapting a running program to the structural properties of the arrays it operates on. This is achieved by partial recompilation of code at runtime, when all structural properties of arrays are known, and dynamic relinking of the running program with dynamically generated code. We sketch out the general compilation framework architecture and provide some details on implementation issues.},
      affil     = {ctca},
      category  = {par,core},
      topics    = {SAC},
      url       = {2010_3.pdf},
    }
    Generic array programming abstracts from structural properties of arrays, such as rank (number of axes/dimensions) and shape (number of elements along each axis/dimension). This allows for abstract program specifications and, as such, is desirable from a software engineering perspective. However, generic programming in this sense does have an adverse effect on runtime performance, at least when executed naively. Static compiler analyses and transformations aim at reconciling software engineering desires for generic code with runtime performance requirements. However, they are bound to fail whenever the required information is not available until runtime. We propose a compilation framework that overcomes the inherent limitations of static analysis by incrementally adapting a running program to the structural properties of the arrays it operates on. This is achieved by partial recompilation of code at runtime, when all structural properties of arrays are known, and dynamic relinking of the running program with dynamically generated code. We sketch out the general compilation framework architecture and provide some details on implementation issues.
  • Thread-local Stacks, a Light-weight Alternative to Thread-local Heaps Stephan Herhut, Carl Joslin, Sven-Bodo Scholz (2010). In 15th Workshop on Compilers for Parallel Computing (CPC'10). Vienna University of Technology, Vienna, Austria. BibTex
    @InProceedings{HerhJoslSchoCPC10,
      author    = {Stephan Herhut and Carl Joslin and Sven-Bodo Scholz},
      title     = {Thread-local Stacks, a Light-weight Alternative to Thread-local Heaps},
      booktitle = {15th Workshop on Compilers for Parallel Computing (CPC'10)},
      year      = {2010},
      publisher = {Vienna University of Technology, Vienna, Austria},
      affil     = {ctca},
      category  = {par,core},
      topics    = {SAC},
    }
  • Single Assignment C Tutorial. Ppopp 2010, Bangalore, India Sven-Bodo Scholz, Stephan Herhut, Clemens Grelck, Frank Penczek (2010). School of Computer Science, University of Hertfordshire. BibTex
    @TechReport{SchoHerhGreletal10,
      author      = {Scholz, Sven-Bodo and Herhut, Stephan and Grelck, Clemens and Penczek, Frank},
      title       = {Single Assignment C Tutorial. Ppopp 2010, Bangalore, India},
      institution = {School of Computer Science, University of Hertfordshire},
      year        = {2010},
      number      = {498},
      affil       = {ctca},
      summary     = {},
      abstract    = {},
      category    = {core},
      DOI         = {},
      URL         = {},
    }

2009

  • Truly Nested Data-parallelism: Compiling Sac to the Microgrid Architecture Stephan Herhut, Carl Joslin, Sven-Bodo Scholz, Clemens Grelck (2009). In 21st Symposium on Implementation and Application of Functional Languages (IFL'09), South Orange, NJ, USA. Seton Hall University. 2009_0.pdf BibTexAbstract
    @InProceedings{HerhJoslSchoetalIFL09draft,
      author     = {Stephan Herhut and Carl Joslin and Sven-Bodo Scholz and Clemens Grelck},
      title      = {Truly Nested Data-parallelism: Compiling Sac to the Microgrid Architecture},
      booktitle  = {21st Symposium on Implementation and Application of Functional Languages (IFL'09), South Orange, NJ, USA},
      year       = {2009},
      number     = {SHU-TR-CS-2009-09-1},
      publisher  = {Seton Hall University},
      abstract   = {Data-parallel programming facilitates elegant specification of concurrency. However, the composability of data-parallel operations so far has been constrained by the requirement to have only flat data-parallel operations at runtime. In this paper, we present early results on our work to exploit hardware support for nested concurrency to directly map nested data-parallel operations in high-level specifications to low-level codes that can be efficiently executed. To this effect, we have devised a compilation scheme from data-parallel operations in SaC to the systems programming language of the Microgrid architecture. Furthermore, we present early empirical results to assert the viability of our approach.},
      affil      = {ctca},
      category   = {par,core},
      pubaddress = {South Orange, NJ, USA},
      url        = {2009_0.pdf},
    }
    Data-parallel programming facilitates elegant specification of concurrency. However, the composability of data-parallel operations so far has been constrained by the requirement to have only flat data-parallel operations at runtime. In this paper, we present early results on our work to exploit hardware support for nested concurrency to directly map nested data-parallel operations in high-level specifications to low-level codes that can be efficiently executed. To this effect, we have devised a compilation scheme from data-parallel operations in SaC to the systems programming language of the Microgrid architecture. Furthermore, we present early empirical results to assert the viability of our approach.
  • Symbiotic Expressions Robert Bernecky, Stephan Herhut, Sven-Bodo Scholz (2009). In 21st Symposium on Implementation and Application of Functional Languages (IFL'09), South Orange, NJ, USA. Seton Hall University, South Orange, NJ, USA. 2009_1.pdf BibTexAbstract
    @InProceedings{BernHerhSchoIFL09draft,
      author    = {Robert Bernecky and Stephan Herhut and Sven-Bodo Scholz},
      title     = {Symbiotic Expressions},
      booktitle = {21st Symposium on Implementation and Application of Functional Languages (IFL'09), South Orange, NJ, USA},
      year      = {2009},
      number    = {{SHU-TR-CS-2009-09-1}},
      publisher = {Seton Hall University, South Orange, NJ, USA.},
      affil     = {ctca},
      abstract  = {We introduce symbiotic expressions, a method for algebraic simplification within a compiler, in lieu of an SMT solver, such as Yices or the Omega Calculator. Symbiotic expressions are compiler-generated expressions, temporarily injected into a program's abstract syntax tree (AST). The compiler's normal optimizations interpret and simplify those expressions, making their results available for the compiler to use as a basis for decisions about further optimization of the source program. The expressions are symbiotic, in the sense that both parties benefit: an optimization benefits, by using the compiler itself to simplify expressions that have been attached, lamprey-like, to the AST by the optimization; the program being compiled benefits, from improved run-time in both serial and parallel environments. We show the utility of symbiotic expressions by using them to extend the SAC compiler's With-Loop-Folding optimization, currently limited to Arrays of Known Shape (AKS), to Arrays of Known Dimensionality (AKD). We show that, in conjunction with array-based constant-folding, injection and propagation of array extrema, and compiler-based expression simplification, symbiotic expressions are an effective tool for implementing advanced array optimizations. Symbiotic expressions are also simpler and more likely to be correct than hard-coded analysis, and are flexible and relatively easy to use. Finally, symbiotic expressions are synergistic: they take immediate advantage of new or improved optimizations in the compiler. Symbiotic expressions are a useful addition to a compiler writer's toolkit, giving the compiler a restricted subset of the analysis power of an SMT solver.},
      category  = {opt},
      URL       = {2009_1.pdf},
    }
    We introduce symbiotic expressions, a method for algebraic simplification within a compiler, in lieu of an SMT solver, such as Yices or the Omega Calculator. Symbiotic expressions are compiler-generated expressions, temporarily injected into a program's abstract syntax tree (AST). The compiler's normal optimizations interpret and simplify those expressions, making their results available for the compiler to use as a basis for decisions about further optimization of the source program. The expressions are symbiotic, in the sense that both parties benefit: an optimization benefits, by using the compiler itself to simplify expressions that have been attached, lamprey-like, to the AST by the optimization; the program being compiled benefits, from improved run-time in both serial and parallel environments. We show the utility of symbiotic expressions by using them to extend the SAC compiler's With-Loop-Folding optimization, currently limited to Arrays of Known Shape (AKS), to Arrays of Known Dimensionality (AKD). We show that, in conjunction with array-based constant-folding, injection and propagation of array extrema, and compiler-based expression simplification, symbiotic expressions are an effective tool for implementing advanced array optimizations. Symbiotic expressions are also simpler and more likely to be correct than hard-coded analysis, and are flexible and relatively easy to use. Finally, symbiotic expressions are synergistic: they take immediate advantage of new or improved optimizations in the compiler. Symbiotic expressions are a useful addition to a compiler writer's toolkit, giving the compiler a restricted subset of the analysis power of an SMT solver.
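    As a rough illustration of the mechanism, consider how an optimisation could pose a question as an expression tree and let the compiler's ordinary constant folder answer it. The following C sketch is ours and deliberately minimal; the real implementation works on the SAC compiler's AST, and every name here is hypothetical.

      /* Toy model of a symbiotic expression: the optimiser builds a
       * question as an expression, reuses the normal constant folder on
       * it, and checks whether the tree collapsed to a constant. */
      #include <stdlib.h>

      typedef enum { CONST, ADD, LEQ } tag_t;
      typedef struct expr { tag_t tag; int val; struct expr *l, *r; } expr_t;

      static expr_t *mk(tag_t t, int v, expr_t *l, expr_t *r) {
          expr_t *e = malloc(sizeof *e);
          e->tag = t; e->val = v; e->l = l; e->r = r;
          return e;
      }

      /* the compiler's ordinary folder, here reduced to two rules */
      static expr_t *fold(expr_t *e) {
          if (e->tag == ADD || e->tag == LEQ) {
              e->l = fold(e->l);
              e->r = fold(e->r);
              if (e->l->tag == CONST && e->r->tag == CONST)
                  return mk(CONST, e->tag == ADD ? e->l->val + e->r->val
                                                 : e->l->val <= e->r->val,
                            NULL, NULL);
          }
          return e;
      }

      int main(void) {
          /* question injected by an optimisation: is x + 1 <= x + 3?
           * once constant propagation has pinned x (say, to 0), folding
           * the symbiotic expression yields a definite answer */
          expr_t *x = mk(CONST, 0, NULL, NULL);
          expr_t *q = mk(LEQ, 0,
                         mk(ADD, 0, x, mk(CONST, 1, NULL, NULL)),
                         mk(ADD, 0, x, mk(CONST, 3, NULL, NULL)));
          expr_t *a = fold(q);
          return (a->tag == CONST && a->val == 1) ? 0 : 1;  /* 0: proven */
      }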
  • Unibench: The Swiss Army Knife for Collaborative, Automated Benchmarking Daniel Rolls, Stephan Herhut, Carl Joslin, Sven-Bodo Scholz (2009). In 21st Symposium on Implementation and Application of Functional Languages (IFL'09), South Orange, NJ, USA. Seton Hall University, South Orange, NJ, USA. 2009_2.pdf BibTex
    @TechReport{RollHerhJosletalIFL09draft,
      author    = {Daniel Rolls and Stephan Herhut and Carl Joslin and Sven-Bodo Scholz},
      title     = {Unibench: The Swiss Army Knife for Collaborative, Automated Benchmarking},
      year      = {2009},
      number    = {{SHU-TR-CS-2009-09-1}},
      affil     = {ctca},
      booktitle = {21st Symposium on Implementation and Application of Functional Languages (IFL'09), South Orange, NJ, USA},
      publisher = {Seton Hall University, South Orange, NJ, USA.},
      category  = {core},
      URL       = {2009_2.pdf},
    }
  • Numerical Simulations of Unsteady Shock Wave Interactions Using Sac and Fortran-90 Alexei Kudryavtsev, Daniel Rolls, Sven-Bodo Scholz, Alex Shafarenko (2009). In 10th International Conference on Parallel Computing Technologies (PaCT'09). pp. 445–456. Springer. 2009_3.pdf BibTexAbstract
    @InProceedings{RollSchoKudretalPACT09,
      author     = {Alexei Kudryavtsev and Daniel Rolls and Sven-Bodo Scholz and Alex Shafarenko},
      title      = {Numerical Simulations of Unsteady Shock Wave Interactions Using Sac and Fortran-90},
      booktitle  = {10th International Conference on Parallel Computing Technologies (PaCT'09)},
      year       = {2009},
      volume     = {5083},
      series     = {Lecture Notes in Computer Science},
      pages      = {445--456},
      publisher  = {Springer},
      abstract   = {This paper briefly introduces SaC: a data-parallel language with an imperative feel but side-effect free and declarative. The experiences of porting a simulation of unsteady shock waves in the Euler system from Fortran to SaC are reported. Both the SaC and the Fortran code were run on a 16-core AMD machine. We demonstrate scalability and performance of our approach by comparison to Fortran.},
      affil      = {ctca},
      category   = {apps},
      pubaddress = {Berlin, Heidelberg, Germany},
      topics     = {SAC},
      url        = {2009_3.pdf},
    }
    This paper briefly introduces SaC: a data-parallel language with an imperative feel but side-effect free and declarative. The experiences of porting a simulation of unsteady shock waves in the Euler system from Fortran to SaC are reported. Both the SaC and the Fortran code were run on a 16-core AMD machine. We demonstrate scalability and performance of our approach by comparison to Fortran.
  • Compiling the Functional Data-parallel Language Sac for Microgrids of Self-adaptive Virtual Processors Clemens Grelck, Stephan Herhut, Chris Jesshope, Carl Joslin, Mike Lankamp, Sven-Bodo Scholz, Alex Shafarenko (2009). In 14th Workshop on Compilers for Parallel Computing (CPC'09), IBM Research Center, Zürich, Switzerland. 2009_4.pdf BibTexAbstract
    @InProceedings{GrelJessJosletalCPC09,
      author    = {Clemens Grelck and Stephan Herhut and Chris Jesshope and Carl Joslin and Mike Lankamp and Sven-Bodo Scholz and Alex Shafarenko},
      title     = {Compiling the Functional Data-parallel Language Sac for Microgrids of Self-adaptive Virtual Processors},
      booktitle = {14th Workshop on Compilers for Parallel Computing (CPC'09), IBM Research Center, Z\"urich, Switzerland},
      year      = {2009},
      abstract  = {We present preliminary results from compiling the high-level, functional and data-parallel programming language SaC into a novel multi-core design: Microgrids of Self-Adaptive Virtual Processors (SVPs). The side-effect free nature of SaC in conjunction with its data-parallel foundation make it an ideal candidate for auto-parallelisation. Notwithstanding these favourable pre-conditions, scheduling and data-placement pose major challenges for effective parallelisation of irregular applications because fine-grained dynamic approaches inflict large overheads on conventional architectures. The Microgrid architecture promises a radical shift: thread creation and context switches are implemented in hardware and cause negligible overhead. Likewise, scheduling of tasks to computing resources is catered for by hardware. This paper investigates aspects of the Microgrid architecture which may influence the overall performance of compiled data-parallel programs. In particular, we look at alternative thread creation schemes for n-dimensional, data-parallel operations and their effect on overall performance. Furthermore, we investigate the architecture's capability to cope with workload imbalances within such operations. The paper presents a sequence of experiments on a cycle-accurate emulator of the Microgrid architecture from which we derive some guiding principles for an effective compilation of data-parallel operations. Based on these principles, we present a compilation scheme for the data-parallel core of SaC.},
      affil     = {ctca},
      category  = {par,design,core},
      topics    = {SAC},
      url       = {2009_4.pdf},
    }
    We present preliminary results from compiling the high-level, functional and data-parallel programming language SaC into a novel multi-core design: Microgrids of Self-Adaptive Virtual Processors (SVPs). The side-effect free nature of SaC in conjunction with its data-parallel foundation make it an ideal candidate for auto-parallelisation. Notwithstanding these favourable pre-conditions, scheduling and data-placement pose major challenges for effective parallelisation of irregular applications because fine-grained dynamic approaches inflict large overheads on conventional architectures. The Microgrid architecture promises a radical shift: thread creation and context switches are implemented in hardware and cause negligible overhead. Likewise, scheduling of tasks to computing resources is catered for by hardware. This paper investigates aspects of the Microgrid architecture which may influence the overall performance of compiled data-parallel programs. In particular, we look at alternative thread creation schemes for n-dimensional, data-parallel operations and their effect on overall performance. Furthermore, we investigate the architecture's capability to cope with workload imbalances within such operations. The paper presents a sequence of experiments on a cycle-accurate emulator of the Microgrid architecture from which we derive some guiding principles for an effective compilation of data-parallel operations. Based on these principles, we present a compilation scheme for the data-parallel core of SaC.
  • Controlling Chaos — on Safe Side-effects in Data-parallel Operations Stephan Herhut, Sven-Bodo Scholz, Clemens Grelck (2009). In 4th Workshop on Declarative Aspects of Multicore Programming (DAMP'09), Savannah, USA. pp. 59–67. ACM Press. DOI2009_5.pdf BibTexAbstract
    @InProceedings{HerhSchoGrelDAMP09,
      author     = {Stephan Herhut and Sven-Bodo Scholz and Clemens Grelck},
      title      = {Controlling Chaos --- on Safe Side-effects in Data-parallel Operations},
      booktitle  = {4th Workshop on Declarative Aspects of Multicore Programming (DAMP'09), Savannah, USA},
      year       = {2009},
      editor     = {Manuel Chakravarty and Leaf Peterson},
      pages      = {59--67},
      publisher  = {ACM Press},
      abstract   = {With the rising variety of hardware designs for multi-core systems, the effectiveness in exploiting implicit concurrency of programs plays a more vital role for programming such systems than ever before. We believe that a combination of a data-parallel approach with a declarative programming-style is up to that task: Data-parallel approaches are known to enable compilers to make efficient use of multi-processors without requiring low-level program annotations. Combining the data-parallel approach with a declarative programming-style guarantees semantic equivalence between sequential and concurrent executions of data parallel operations. Furthermore, the side-effect free setting and explicit model of dependencies enables compilers to maximise the size of the data-parallel program sections. However, the strength of the rigidity of the declarative approach also constitutes its weakness: Being bound to observe all data dependencies categorically rules out the use of side-effecting operations within data-parallel sections. Not only does this limit the size of these regions in certain situations, but it may also hamper an effective workload distribution. Considering side effects such as plotting individual pixels of an image or output for debugging purposes, there are situations where a non-deterministic order of side-effects would not be considered harmful at all. We propose a mechanism for enabling such non-determinism on the execution of side-effecting operations within data-parallel sections without sacrificing the side-effect free setting in general. Outside of the data-parallel sections we ensure single-threading of side-effecting operations using uniqueness typing. Within data-parallel operations however we allow the side-effecting operations of different threads to occur in any order, as long as effects of different threads are not interleaved. Furthermore, we still model the dependencies arising from the manipulated states within the data parallel sections. This measure preserves the explicitness of all data dependencies and therefore it preserves the transformational potential of any restructuring compiler.},
      affil      = {ctca},
      category   = {par,states},
      doi        = {10.1145/1481839.1481847},
      isbn       = {978-1-60558-417-1},
      pubaddress = {New York, NY, USA},
      topics     = {SAC},
      url        = {2009_5.pdf},
    }
    With the rising variety of hardware designs for multi-core systems, the effectiveness in exploiting implicit concurrency of programs plays a more vital role for programming such systems than ever before. We believe that a combination of a data-parallel approach with a declarative programming-style is up to that task: Data-parallel approaches are known to enable compilers to make efficient use of multi-processors without requiring low-level program annotations. Combining the data-parallel approach with a declarative programming-style guarantees semantic equivalence between sequential and concurrent executions of data parallel operations. Furthermore, the side-effect free setting and explicit model of dependencies enables compilers to maximise the size of the data-parallel program sections. However, the strength of the rigidity of the declarative approach also constitutes its weakness: Being bound to observe all data dependencies categorically rules out the use of side-effecting operations within data-parallel sections. Not only does this limit the size of these regions in certain situations, but it may also hamper an effective workload distribution. Considering side effects such as plotting individual pixels of an image or output for debugging purposes, there are situations where a non-deterministic order of side-effects would not be considered harmful at all. We propose a mechanism for enabling such non-determinism on the execution of side-effecting operations within data-parallel sections without sacrificing the side-effect free setting in general. Outside of the data-parallel sections we ensure single-threading of side-effecting operations using uniqueness typing. Within data-parallel operations however we allow the side-effecting operations of different threads to occur in any order, as long as effects of different threads are not interleaved. Furthermore, we still model the dependencies arising from the manipulated states within the data parallel sections. This measure preserves the explicitness of all data dependencies and therefore it preserves the transformational potential of any restructuring compiler.
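    The guarantee "any order across threads, but never interleaved" can be pictured with per-thread buffering. The pthreads sketch below is our reading of that idea, not the paper's implementation (which rests on uniqueness typing inside the compiler): every thread accumulates its side effects locally and emits them as one atomic block.

      /* Minimal sketch, assuming side effects can be buffered per thread:
       * output order across threads is arbitrary but never interleaved. */
      #include <pthread.h>
      #include <stdio.h>

      static pthread_mutex_t out_lock = PTHREAD_MUTEX_INITIALIZER;

      static void *worker(void *arg) {
          int id = *(int *)arg;
          char buf[256];
          int len = 0;
          for (int i = 0; i < 3; i++)   /* side effects accumulate locally */
              len += snprintf(buf + len, sizeof buf - len,
                              "thread %d: element %d\n", id, i);
          pthread_mutex_lock(&out_lock);    /* flush as one atomic block */
          fputs(buf, stdout);
          pthread_mutex_unlock(&out_lock);
          return NULL;
      }

      int main(void) {
          pthread_t t[4];
          int ids[4];
          for (int i = 0; i < 4; i++) {
              ids[i] = i;
              pthread_create(&t[i], NULL, worker, &ids[i]);
          }
          for (int i = 0; i < 4; i++) pthread_join(t[i], NULL);
          return 0;
      }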
  • Controlling Chaos — on Safe Side-effects in Data-parallel Operations Stephan Herhut, Sven-Bodo Scholz, Clemens Grelck (2009). ACM SIGPLAN Notices 44 (5) pp. 9–10. 2009_6.pdf BibTexAbstract
    @Article{HerhSchoGrelSPN09,
      author   = {Stephan Herhut and Sven-Bodo Scholz and Clemens Grelck},
      title    = {Controlling Chaos --- on Safe Side-effects in Data-parallel Operations},
      journal  = {ACM SIGPLAN Notices},
      year     = {2009},
      volume   = {44},
      number   = {5},
      pages    = {9--10},
      abstract = {With the rising variety of hardware designs for multi-core systems, the effectiveness in exploiting implicit concurrency of programs plays a more vital role for programming such systems than ever before. We believe that a combination of a data-parallel approach with a declarative programming-style is up to that task: Data-parallel approaches are known to enable compilers to make efficient use of multi-processors without requiring low-level program annotations. Combining the data-parallel approach with a declarative programming-style guarantees semantic equivalence between sequential and concurrent executions of data parallel operations. Furthermore, the side-effect free setting and explicit model of dependencies enables compilers to maximise the size of the data-parallel program sections. However, the strength of the rigidity of the declarative approach also constitutes its weakness: Being bound to observe all data dependencies categorically rules out the use of side-effecting operations within data-parallel sections. Not only does this limit the size of these regions in certain situations, but it may also hamper an effective workload distribution. Considering side effects such as plotting individual pixels of an image or output for debugging purposes, there are situations where a non-deterministic order of side-effects would not be considered harmful at all. We propose a mechanism for enabling such non-determinism on the execution of side-effecting operations within data-parallel sections without sacrificing the side-effect free setting in general. Outside of the data-parallel sections we ensure single-threading of side-effecting operations using uniqueness typing. Within data-parallel operations however we allow the side-effecting operations of different threads to occur in any order, as long as effects of different threads are not interleaved. Furthermore, we still model the dependencies arising from the manipulated states within the data parallel sections. This measure preserves the explicitness of all data dependencies and therefore it preserves the transformational potential of any restructuring compiler.},
      affil    = {ctca},
      category = {par,states},
      issn     = {0362-1340},
      topics   = {SAC},
      url      = {2009_6.pdf},
    }
    With the rising variety of hardware designs for multi-core systems, the effectiveness in exploiting implicit concurrency of programs plays a more vital role for programming such systems than ever before. We believe that a combination of a data-parallel approach with a declarative programming-style is up to that task: Data-parallel approaches are known to enable compilers to make efficient use of multi-processors without requiring low-level program annotations. Combining the data-parallel approach with a declarative programming-style guarantees semantic equivalence between sequential and concurrent executions of data parallel operations. Furthermore, the side-effect free setting and explicit model of dependencies enables compilers to maximise the size of the data-parallel program sections. However, the strength of the rigidity of the declarative approach also constitutes its weakness: Being bound to observe all data dependencies categorically rules out the use of side-effecting operations within data-parallel sections. Not only does this limit the size of these regions in certain situations, but it may also hamper an effective workload distribution. Considering side effects such as plotting individual pixels of an image or output for debugging purposes, there are situations where a non-deterministic order of side-effects would not be considered harmful at all. We propose a mechanism for enabling such non-determinism on the execution of side-effecting operations within data-parallel sections without sacrificing the side-effect free setting in general. Outside of the data-parallel sections we ensure single-threading of side-effecting operations using uniqueness typing. Within data-parallel operations however we allow the side-effecting operations of different threads to occur in any order, as long as effects of different threads are not interleaved. Furthermore, we still model the dependencies arising from the manipulated states within the data parallel sections. This measure preserves the explicitness of all data dependencies and therefore it preserves the transformational potential of any restructuring compiler.
  • Towards Compiling Sac to Cuda Jing Guo, Jeyarajan Thiyagalingam, Sven-Bodo Scholz (2009). In 10th Symposium on Trends in Functional Programming (TFP'09). pp. 33–49. Intellect. BibTex
    @InProceedings{GuoThiySchoTFP09,
      author    = {Jing Guo and Jeyarajan Thiyagalingam and Sven-Bodo Scholz},
      title     = {Towards Compiling Sac to Cuda},
      booktitle = {10th Symposium on Trends in Functional Programming (TFP'09)},
      year      = {2009},
      editor    = {Zoltan Horv\'ath and Vikt\'oria Zs\'ok},
      pages     = {33--49},
      publisher = {Intellect},
      affil     = {ctca},
      category  = {core},
      topics    = {SAC,CUDA},
    }

2008

  • Using N-grams to Rapidly Characterise the Evolution of Software Code A.W. Rainer, P.C.R. Lane, J.A. Malcolm, S. Scholz (2008). In The Fourth International ERCIM Workshop on Software Evolution and Evolvability (EVOL'08), L'Aquila, Italy. pp. 17–31. IEEE Xplore. DOI2008_0.pdf BibTexAbstract
    @InProceedings{RainLaneMalcetalEVOL08,
      author    = {A.W. Rainer and P.C.R. Lane and J.A. Malcolm and S. Scholz},
      title     = {Using N-grams to Rapidly Characterise the Evolution of Software Code},
      booktitle = {The Fourth International ERCIM Workshop on Software Evolution and Evolvability (EVOL'08), L'Aquila, Italy},
      year      = {2008},
      pages     = {17--31},
      publisher = {IEEE Xplore},
      abstract  = {Text-based approaches to the analysis of software evolution are attractive because of the fine-grained, token-level comparisons they can generate. The use of such approaches has, however, been constrained by the lack of an efficient implementation. In this paper we demonstrate the ability of Ferret, which uses n-grams of 3 tokens, to characterise the evolution of software code. Ferret’s implementation operates in almost linear time and is at least an order of magnitude faster than the diff tool. Ferret’s output can be analysed to reveal several characteristics of software evolution, such as: the lifecycle of a single file, the degree of change between two files, and possible regression. In addition, the similarity scores produced by Ferret can be aggregated to measure larger parts of the system being analysed.},
      affil     = {ctca},
      category  = {app},
      doi       = {10.1109/ASEW.2008.4686320},
      topics    = {SAC},
      url       = {2008_0.pdf},
    }
    Text-based approaches to the analysis of software evolution are attractive because of the fine-grained, token-level comparisons they can generate. The use of such approaches has, however, been constrained by the lack of an efficient implementation. In this paper we demonstrate the ability of Ferret, which uses n-grams of 3 tokens, to characterise the evolution of software code. Ferret’s implementation operates in almost linear time and is at least an order of magnitude faster than the diff tool. Ferret’s output can be analysed to reveal several characteristics of software evolution, such as: the lifecycle of a single file, the degree of change between two files, and possible regression. In addition, the similarity scores produced by Ferret can be aggregated to measure larger parts of the system being analysed.
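    The core of the resemblance computation is small. The C sketch below shows a Ferret-style score over 3-token n-grams, computed as common trigrams divided by distinct trigrams overall; tokenisation is left out, the fixed buffer sizes are a simplification, and all names are ours rather than Ferret's.

      /* Sketch of a trigram resemblance score on integer token ids. */
      #include <stdio.h>
      #include <stdlib.h>

      typedef struct { int a, b, c; } tri_t;

      static int cmp(const void *x, const void *y) {
          const tri_t *p = x, *q = y;
          if (p->a != q->a) return p->a - q->a;
          if (p->b != q->b) return p->b - q->b;
          return p->c - q->c;
      }

      /* collect the *set* of 3-token n-grams of a token sequence */
      static int trigrams(const int *tok, int n, tri_t *out) {
          int m = 0, k = 0;
          for (int i = 0; i + 2 < n; i++)
              out[m++] = (tri_t){ tok[i], tok[i + 1], tok[i + 2] };
          qsort(out, m, sizeof *out, cmp);
          for (int i = 0; i < m; i++)              /* drop duplicates */
              if (i == 0 || cmp(&out[i], &out[k - 1]) != 0) out[k++] = out[i];
          return k;
      }

      /* |A intersect B| / |A union B| over trigram sets (<= 64 each here) */
      static double score(const int *x, int nx, const int *y, int ny) {
          tri_t tx[64], ty[64];
          int mx = trigrams(x, nx, tx), my = trigrams(y, ny, ty);
          int i = 0, j = 0, inter = 0;
          while (i < mx && j < my) {       /* sorted-set intersection */
              int c = cmp(&tx[i], &ty[j]);
              if (c == 0) { inter++; i++; j++; }
              else if (c < 0) i++;
              else j++;
          }
          return (double)inter / (mx + my - inter);
      }

      int main(void) {
          int v1[] = {1, 2, 3, 4, 5}, v2[] = {1, 2, 3, 5, 4};
          printf("similarity: %.2f\n", score(v1, 5, v2, 5));  /* 0.20 */
          return 0;
      }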
  • Efficient Heap Management for Declarative Data Parallel Programming on Multicores Clemens Grelck, Sven-Bodo Scholz (2008). In 3rd Workshop on Declarative Aspects of Multicore Programming (DAMP'08), San Francisco, CA, USA. pp. 17–31. ACM Press. 2008_1.pdf BibTexAbstract
    @InProceedings{GrelSchoDAMP08,
      author     = {Clemens Grelck and Sven-Bodo Scholz},
      title      = {Efficient Heap Management for Declarative Data Parallel Programming on Multicores},
      booktitle  = {3rd Workshop on Declarative Aspects of Multicore Programming (DAMP'08), San Francisco, CA, USA},
      year       = {2008},
      editor     = {Manuel Hermenegildo and Leaf Peterson and Neal Glew},
      pages      = {17--31},
      publisher  = {ACM Press},
      affil      = {ctca},
      pubaddress = {New York, NY, USA},
      topics     = {SAC},
      abstract   = {Declarative data parallel programming for shared memory multiprocessor systems implies paradigm-specific demands on the organisation of memory management. As a key feature of declarative programming implicit memory management is indispensable. Yet, the memory objects to be managed are very different from those that are predominant in general-purpose functional or object-oriented languages. Rather than complex structures of relatively small items interconnected by references, we are faced with large chunks of memory, usually arrays, which often account for 100s of MB each. Such sizes make relocation of data or failure to update arrays in-place prohibitively expensive. To address these challenges of the data parallel setting, the functional array language SaC employs continuous garbage collection via reference counting combined with several aggressive optimisation techniques. However, we have observed that overall memory performance does not only rely on efficient reference counting techniques, but to a similar extent on the underlying memory allocation strategy. As in the general memory management setting we can identify specific demands of the declarative data parallel setting on heap organisation. In this paper, we propose a heap manager design tailor-made to the needs of concurrent executions of declarative data parallel programs whose memory management is based on reference counting. We present run-time measurements that quantify the impact of the proposed design and relate it to the performance of several different general purpose heap managers that are available in the public domain.},
      category   = {par, core},
      URL        = {2008_1.pdf},
    }
    Declarative data parallel programming for shared memory multiprocessor systems implies paradigm-specific demands on the organisation of memory management. As a key feature of declarative programming implicit memory management is indispensable. Yet, the memory objects to be managed are very different from those that are predominant in general-purpose functional or object-oriented languages. Rather than complex structures of relatively small items interconnected by references, we are faced with large chunks of memory, usually arrays, which often account for 100s of MB each. Such sizes make relocation of data or failure to update arrays in-place prohibitively expensive. To address these challenges of the data parallel setting, the functional array language SaC employs continuous garbage collection via reference counting combined with several aggressive optimisation techniques. However, we have observed that overall memory performance does not only rely on efficient reference counting techniques, but to a similar extent on the underlying memory allocation strategy. As in the general memory management setting we can identify specific demands of the declarative data parallel setting on heap organisation. In this paper, we propose a heap manager design tailor-made to the needs of concurrent executions of declarative data parallel programs whose memory management is based on reference counting. We present run-time measurements that quantify the impact of the proposed design and relate it to the performance of several different general purpose heap managers that are available in the public domain.
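    To make the setting concrete, here is a toy version of reference-counted array management as the abstract describes it: large flat payloads, continuous collection via counts, and in-place reuse when the count permits. Layout and names are our illustration, not SaC's actual runtime.

      /* Minimal sketch of reference-counted array descriptors. */
      #include <stdlib.h>

      typedef struct {
          int    rc;       /* reference count, maintained by compiled code */
          size_t nelems;
          double data[];   /* large flat payload; never relocated */
      } array_t;

      static array_t *arr_alloc(size_t n) {
          array_t *a = malloc(sizeof(array_t) + n * sizeof(double));
          a->rc = 1;
          a->nelems = n;
          return a;
      }

      static array_t *arr_share(array_t *a) { a->rc++; return a; }

      static void arr_release(array_t *a) {
          if (--a->rc == 0) free(a);   /* collected continuously, no pauses */
      }

      /* a count of 1 licenses destructive reuse of the argument, avoiding
       * a fresh multi-megabyte allocation for the result */
      static array_t *arr_scale(array_t *a, double f) {
          array_t *r = (a->rc == 1) ? a : arr_alloc(a->nelems);
          for (size_t i = 0; i < a->nelems; i++)
              r->data[i] = f * a->data[i];
          if (r != a) arr_release(a);
          return r;
      }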
  • A Hybrid Shared Memory Execution Model for a Data Parallel Language with I/o Clemens Grelck, Steffen Kuthe, Sven-Bodo Scholz (2008). Parallel Processing Letters 18 (1) pp. 23–37. BibTexAbstract
    @Article{GrelKuthSchoPPL08,
      author   = {Clemens Grelck and Steffen Kuthe and Sven-Bodo Scholz},
      title    = {A Hybrid Shared Memory Execution Model for a Data Parallel Language with I/o},
      journal  = {Parallel Processing Letters},
      year     = {2008},
      volume   = {18},
      number   = {1},
      pages    = {23--37},
      abstract = {We propose a novel execution model for the implicitly parallel execution of data parallel programs in the presence of general I/O operations. This model is called hybrid because it combines the advantages of the standard execution models fork/join and SPMD. Based on program analysis the hybrid model adapts itself to one or the other on the granularity of individual instructions. We outline compilation techniques that systematically derive the organization of parallel code from data flow characteristics aiming at the reduction of execution mode switches in general and synchronization/communication requirements in particular. Experiments based on a prototype implementation show the effectiveness of the hybrid execution model for reducing parallel overhead.},
      affil    = {ctca},
      category = {par,core},
      topics   = {SAC},
    }
    We propose a novel execution model for the implicitly parallel execution of data parallel programs in the presence of general I/O operations. This model is called hybrid because it combines the advantages of the standard execution models fork/join and SPMD. Based on program analysis the hybrid model adapts itself to one or the other on the granularity of individual instructions. We outline compilation techniques that systematically derive the organization of parallel code from data flow characteristics aiming at the reduction of execution mode switches in general and synchronization/communication requirements in particular. Experiments based on a prototype implementation show the effectiveness of the hybrid execution model for reducing parallel overhead.
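    The distinction between the two standard models is easy to see in code. The pthreads sketch below, our illustration only, shows the SPMD side of the hybrid: the thread team stays alive across steps, and a sequential I/O instruction runs on a single designated thread between barriers instead of joining and re-forking the team.

      /* Sketch of SPMD execution with sequential I/O on the master thread. */
      #include <pthread.h>
      #include <stdio.h>

      #define NT 4
      static pthread_barrier_t bar;
      static double acc[NT];

      static void *spmd(void *arg) {
          int id = *(int *)arg;
          acc[id] = id * 1.5;                 /* data-parallel step */
          pthread_barrier_wait(&bar);
          if (id == 0) {                      /* sequential step, master only */
              double s = 0;
              for (int i = 0; i < NT; i++) s += acc[i];
              printf("sum = %g\n", s);        /* I/O without leaving SPMD mode */
          }
          pthread_barrier_wait(&bar);
          acc[id] += 1.0;                     /* next data-parallel step */
          return NULL;
      }

      int main(void) {
          pthread_t t[NT];
          int ids[NT];
          pthread_barrier_init(&bar, NULL, NT);
          for (int i = 0; i < NT; i++) {
              ids[i] = i;
              pthread_create(&t[i], NULL, spmd, &ids[i]);
          }
          for (int i = 0; i < NT; i++) pthread_join(t[i], NULL);
          pthread_barrier_destroy(&bar);
          return 0;
      }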
  • From Contracts Towards Dependent Types: Proofs by Partial Evaluation Stephan Herhut, Sven-Bodo Scholz, Robert Bernecky, Clemens Grelck, Kai Trojahner (2008). In 19th International Symposium on Implementation and Application of Functional Languages (IFL'07), Freiburg, Germany, Revised Selected Papers. pp. 254–273. Springer. DOIfctdt.pdf BibTexAbstract
    @InProceedings{HerhSchoBernetalIFL07,
      author     = {Stephan Herhut and Sven-Bodo Scholz and Robert Bernecky and Clemens Grelck and Kai Trojahner},
      title      = {From Contracts Towards Dependent Types: Proofs by Partial Evaluation},
      booktitle  = {19th International Symposium on Implementation and Application of Functional Languages (IFL'07), Freiburg, Germany, Revised Selected Papers},
      year       = {2008},
      editor     = {Olaf Chitil and Zoltan Horv\'ath and Vikt\'oria Zs\'ok},
      volume     = {5083},
      series     = {Lecture Notes in Computer Science},
      pages      = {254--273},
      publisher  = {Springer},
      affil      = {ctca},
      isbn       = {978-3-540-85372-5},
      pubaddress = {Berlin, Heidelberg, Germany},
      topics     = {SAC},
      abstract   = {The specification and resolution of non-trivial domain constraints has become a well-recognised measure for improving the stability of large software systems. In this paper we propose an approach based on partial evaluation which tries to prove such constraints statically as far as possible and inserts efficient dynamic checks otherwise.},
      category   = {types, opt},
      doi        = {10.1007/978-3-540-85373-2_15},
      url        = {fctdt.pdf} 
    }
    The specification and resolution of non-trivial domain constraints has become a well-recognised measure for improving the stability of large software systems. In this paper we propose an approach based on partial evaluation which tries to prove such constraints statically as far as possible and inserts efficient dynamic checks otherwise.
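    In spirit, the approach turns each contract into ordinary code that the partial evaluator tries to fold away. A minimal C analogue, ours rather than the paper's notation: when the optimiser can see statically that the lengths match, the guard disappears; otherwise it remains as a cheap dynamic check.

      /* A domain constraint written as plain code: provable cases fold
       * away under constant propagation, the rest check at run time. */
      #include <assert.h>
      #include <stddef.h>

      static inline void require(int cond) { assert(cond); }

      double dot(const double *a, size_t na, const double *b, size_t nb) {
          require(na == nb);        /* domain constraint of dot product */
          double s = 0;
          for (size_t i = 0; i < na; i++) s += a[i] * b[i];
          return s;
      }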

2007

  • Dependently Typed Array Programs Don't Go Wrong Kai Trojahner, Clemens Grelck (2007). In 19th Nordic Workshop on Programming Theory (NWPT'07), Oslo, Norway, October 10-12, 2007. pp. 64–66. University of Oslo, Institute of Informatics. DOIdtapdgw.pdf BibTexAbstract
    @InProceedings{TrojGrelNWPT07,
      AUTHOR    = {Kai Trojahner and Clemens Grelck},
      TITLE     = {Dependently Typed Array Programs Don't Go Wrong},
      EDITOR    = {Einar Broch Johnsen and Olaf Owe and Gerardo Schneider},
      BOOKTITLE = {19th Nordic Workshop on Programming Theory (NWPT'07), Oslo, Norway, October 10-12, 2007},
      PUBLISHER = {University of Oslo, Institute of Informatics},
      SERIES    = {Research Report},
      VOLUME    = {366},
      YEAR      = 2007,
      PAGES     = {64--66},
      NOTE      = {[ISBN 82-7368-324-9]},
      abstract   = {The array programming paradigm adopts multidimensional arrays as the fundamental data structures of computation. Array operations process entire arrays instead of just single elements. This makes array programs highly expressive and introduces data parallelism in a natural way. Array programming imposes non-trivial structural constraints on ranks, shapes, and element values of arrays. A prominent example of such constraint violations is the out-of-bound array access. Usually, such constraints are enforced by means of run time checks. Both the run time overhead inflicted by dynamic constraint checking and the uncertainty of proper program evaluation are undesirable. In this paper, we propose a novel type system for array programs based on dependent types. Our type system makes dynamic constraint checks obsolete and guarantees orderly evaluation of well-typed programs. We employ integer vectors of statically unknown length to index array types. We also show how constraints on these vectors are resolved using a suitable reduction to integer scalars. Our presentation is based on a functional array calculus that captures the essence of the paradigm without the legacy and obfuscation of a fully-fledged array programming language.},
      category   = {types},
      doi        = {10.1016/j.jlap.2009.03.002},
      url        = {dtapdgw.pdf},
    }
    The array programming paradigm adopts multidimensional arrays as the fundamental data structures of computation. Array operations process entire arrays instead of just single elements. This makes array programs highly expressive and introduces data parallelism in a natural way. Array programming imposes non-trivial structural constraints on ranks, shapes, and element values of arrays. A prominent example of such constraint violations is the out-of-bound array access. Usually, such constraints are enforced by means of run time checks. Both the run time overhead inflicted by dynamic constraint checking and the uncertainty of proper program evaluation are undesirable. In this paper, we propose a novel type system for array programs based on dependent types. Our type system makes dynamic constraint checks obsolete and guarantees orderly evaluation of well-typed programs. We employ integer vectors of statically unknown length to index array types. We also show how constraints on these vectors are resolved using a suitable reduction to integer scalars. Our presentation is based on a functional array calculus that captures the essence of the paradigm without the legacy and obfuscation of a fully-fledged array programming language.
  • Coordinating Data Parallel Sac Programs with S-net Clemens Grelck, Sven-Bodo Scholz, Alex Shafarenko (2007). In 21st IEEE International Parallel and Distributed Processing Symposium (IPDPS'07), Long Beach, California, USA. IEEE Computer Society Press, Los Alamitos, California, USA. cdpspws.pdf BibTexAbstract
    @InProceedings{GrelSchoShafIPDPS07,
      author    = {Clemens Grelck and Sven-Bodo Scholz and Alex Shafarenko},
      title     = {Coordinating Data Parallel Sac Programs with S-net},
      booktitle = {21st IEEE International Parallel and Distributed Processing Symposium (IPDPS'07), Long Beach, California, USA},
      year      = {2007},
      publisher = {IEEE Computer Society Press, Los Alamitos, California, USA},
      abstract  = {We propose a two-layered approach for exploiting different forms of concurrency in complex systems: We specify computational components in our functional array language SaC, which exploits data parallel properties of array processing code. The declarative stream processing language S-Net is used to orchestrate the collaborative behaviour of these components in a streaming network. We illustrate our approach by a hybrid implementation of a sudoku puzzle solver as a representative for more complex search problems.},
      affil     = {ctca},
      category  = {par},
      topics    = {SAC,SNet},
      url       = {cdpspws.pdf},
    }
    We propose a two-layered approach for exploiting different forms of concurrency in complex systems: We specify computational components in our functional array language SaC, which exploits data parallel properties of array processing code. The declarative stream processing language S-Net is used to orchestrate the collaborative behaviour of these components in a streaming network. We illustrate our approach by a hybrid implementation of a sudoku puzzle solver as a representative for more complex search problems.
  • Sac: Off-the-shelf Support for Data-parallelism on Multicores Clemens Grelck, Sven-Bodo Scholz (2007). In 2nd Workshop on Declarative Aspects of Multicore Programming (DAMP'07), Nice, France. pp. 25–33. ACM Press. DOIdamp07.pdf BibTexAbstract
    @InProceedings{GrelSchoDAMP07,
      author     = {Clemens Grelck and Sven-Bodo Scholz},
      title      = {Sac: Off-the-shelf Support for Data-parallelism on Multicores},
      booktitle  = {2nd Workshop on Declarative Aspects of Multicore Programming (DAMP'07), Nice, France},
      year       = {2007},
      editor     = {Neal Glew and Guy Blelloch},
      pages      = {25--33},
      publisher  = {ACM Press},
      abstract   = {The advent of multicore processors has raised new demand for harnessing concurrency in the software mass market. We summarise our previous work on the data parallel, functional array processing language SAC. Its compiler technology is geared towards highly runtime-efficient support for shared memory multiprocessors and, thus, is readily applicable to today’s off-the-shelf multicore systems. Following a brief introduction to the language itself, we identify the major compilation challenges and outline the solutions we have developed.},
      affil      = {ctca},
      category   = {par},
      contents   = {ISBN:978-1-59593-690-5},
      doi        = {10.1145/1248648.1248654},
      pubaddress = {New York, NY, USA},
      topics     = {SAC},
      url        = {damp07.pdf},
    }
    The advent of multicore processors has raised new demand for harnessing concurrency in the software mass market. We summarise our previous work on the data parallel, functional array processing language SAC. Its compiler technology is geared towards highly runtime-efficient support for shared memory multiprocessors and, thus, is readily applicable to today’s off-the-shelf multicore systems. Following a brief introduction to the language itself, we identify the major compilation challenges and outline the solutions we have developed.
  • On Optimising Shape-generic Array Programs Using Symbolic Structural Information Kai Trojahner, Clemens Grelck, Sven-Bodo Scholz (2007). In Implementation and Application of Functional Languages, 18th International Symposium (IFL'06), Budapest, Hungary, Revised Selected Papers. pp. 1–18. Springer. DOIoosgapussi.pdf BibTexAbstract
    @InProceedings{TrojGrelSchoIFL06,
      author     = {Kai Trojahner and Clemens Grelck and Sven-Bodo Scholz},
      title      = {On Optimising Shape-generic Array Programs Using Symbolic Structural Information},
      booktitle  = {Implementation and Application of Functional Languages, 18th International Symposium (IFL'06), Budapest, Hungary, Revised Selected Papers},
      year       = {2007},
      editor     = {Zoltan Horv\'ath and Vikt\'oria Zs\'ok},
      volume     = {4449},
      series     = {Lecture Notes in Computer Science},
      pages      = {1--18},
      publisher  = {Springer},
      abstract   = { Shape-generic programming and high run time performance do match if generic source code is systematically specialised into nongeneric executable code. However, as soon as we drop the assumption of whole-world knowledge or refrain from specialisation for other reasons, compiled generic code is substantially less efficient. Limited effectiveness of code optimisation techniques due to the inherent lack of knowledge about the structural properties of arrays can be identified as the single most important source of inefficiency. However, in many cases partial structural information or structural relationships between arrays would actually suffice for optimisation. We propose symbolic array attributes as a uniform scheme to infer and to represent partial and relational structural information in shape-generic array code. By reusing the regular language to express structural properties in intermediate code, existing optimisations benefit from symbolic array attributes with little or no alteration. In fact, program optimisation and identification of structural properties cross-fertilise each other. We outline our approach in the context of the functional array language SaC and demonstrate its effectiveness by a small case study.},
      affil      = {ctca},
      category   = {core, opt},
      doi        = {10.1007/978-3-540-74130-5_},
      pubaddress = {Berlin, Heidelberg, Germany},
      topics     = {SAC},
      url        = {oosgapussi.pdf},
    }
    Shape-generic programming and high run time performance do match if generic source code is systematically specialised into nongeneric executable code. However, as soon as we drop the assumption of whole-world knowledge or refrain from specialisation for other reasons, compiled generic code is substantially less efficient. Limited effectiveness of code optimisation techniques due to the inherent lack of knowledge about the structural properties of arrays can be identified as the single most important source of inefficiency. However, in many cases partial structural information or structural relationships between arrays would actually suffice for optimisation. We propose symbolic array attributes as a uniform scheme to infer and to represent partial and relational structural information in shape-generic array code. By reusing the regular language to express structural properties in intermediate code, existing optimisations benefit from symbolic array attributes with little or no alteration. In fact, program optimisation and identification of structural properties cross-fertilise each other. We outline our approach in the context of the functional array language SaC and demonstrate its effectiveness by a small case study.
  • Index Vector Elimination: Making Index Vectors Affordable Robert Bernecky, Stephan Herhut, Sven-Bodo Scholz, Kai Trojahner, Clemens Grelck, Alex Shafarenko (2007). In Implementation and Application of Functional Languages, 18th International Symposium (IFL'06), Budapest, Hungary, Revised Selected Papers. pp. 19–36. Springer. DOIivemiva.pdf BibTexAbstract
    @InProceedings{BernHerhSchoetalIFL06,
      author     = {Robert Bernecky and Stephan Herhut and Sven-Bodo Scholz and Kai Trojahner and Clemens Grelck and Alex Shafarenko},
      title      = {Index Vector Elimination: Making Index Vectors Affordable},
      booktitle  = {Implementation and Application of Functional Languages, 18th International Symposium (IFL'06), Budapest, Hungary, Revised Selected Papers},
      year       = {2007},
      editor     = {Zoltan Horv\'ath and Vikt\'oria Zs\'ok and Andrew Butterfield},
      volume     = {4449},
      series     = {Lecture Notes in Computer Science},
      pages      = {19--36},
      publisher  = {Springer},
      affil      = {ctca},
      contents   = {[ISBN 978-3-540-74129-9]},
      pubaddress = {Berlin, Heidelberg, Germany},
      topics     = {SAC},
      abstract   = {Compiling indexing operations on n-dimensional arrays into efficiently executable code is a challenging task. This paper focuses on the reduction of offset computations as they typically occur when transforming index vectors into offsets for linearized representations of n-dimensional arrays. We present a high-level optimization to that effect which is generally applicable, even in the presence of statically unknown rank (n). Our experiments show run-time improvements between a factor of 2 and 16 on a set of real-world benchmarks.},
      category   = {opt},
      doi        = {10.1007/978-3-540-74130-5_2},
      url        = {ivemiva.pdf} 
    }
    Compiling indexing operations on n-dimensional arrays into efficiently executable code is a challenging task. This paper focuses on the reduction of offset computations as they typically occur when transforming index vectors into offsets for linearized representations of n-dimensional arrays. We present a high-level optimization to that effect which is generally applicable, even in the presence of statically unknown rank (n). Our experiments show run-time improvements between a factor of 2 and 16 on a set of real-world benchmarks.
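    The offset computations at stake look roughly as follows; the C sketch contrasts a naive translation, which maps the index vector to a linear offset on every access, with the incremental form that index vector elimination aims for. The example is ours and fixes the rank at 2 for readability.

      /* Naive vs. offset-carrying traversal of a row-major rank-2 array. */
      #include <stddef.h>

      /* row-major offset of index vector iv in an array of rank r */
      static size_t vect2offset(int r, const int *iv, const int *shp) {
          size_t off = 0;
          for (int k = 0; k < r; k++) off = off * shp[k] + iv[k];
          return off;
      }

      void add_naive(double *z, const double *x, const double *y,
                     const int *shp) {
          int iv[2];
          for (iv[0] = 0; iv[0] < shp[0]; iv[0]++)
              for (iv[1] = 0; iv[1] < shp[1]; iv[1]++) {
                  size_t o = vect2offset(2, iv, shp);  /* recomputed each time */
                  z[o] = x[o] + y[o];
              }
      }

      void add_ive(double *z, const double *x, const double *y,
                   const int *shp) {
          size_t o = 0;                             /* induction variable */
          for (int i = 0; i < shp[0]; i++)
              for (int j = 0; j < shp[1]; j++, o++) /* offset just counts up */
                  z[o] = x[o] + y[o];
      }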

2006

  • Shape Cliques Robert Bernecky (2006). In 18th International Symposium on Implementation and Application of Functional Languages (IFL'06), Budapest, Hungary. pp. 1–12. Eötvös Loránd University, Faculty of Informatics, Budapest, Hungary. sc.pdf BibTexAbstract
    @inproceedings{BerneckyIFL06,
      title        = {Shape Cliques},
      author       = {Robert Bernecky},
      year         = 2006,
      booktitle    = {18th International Symposium on Implementation and Application of Functional Languages (IFL'06), Budapest, Hungary},
      publisher    = {E\"otv\"os Lor\'and University, Faculty of Informatics, Budapest, Hungary},
      series       = {Technical Report 2006-S01},
      pages        = {1--12},
      editor       = {Zoltan Horv\'ath and Vikt\'oria Zs\'ok},
      topics       = {SAC,APEX,APL,Shape Cliques,Shape Inference},
      abstract     = {We introduce shape cliques, a simple way to organize a subset of the arrays appearing in an array-language-based application into sets of identically shaped arrays (shape cliques), and show how a compiler can analyze an application to infer membership in those cliques. We describe an algorithm for performing shape clique inference (SCI), and demonstrate that shape cliques can improve the performance of generated code, by permitting extension of an optimization for removal of run-time checks, and by extending the set of arrays to which optimizations, such as Index Vector Elimination (IVE), can be applied. Implementation of SCI in the APEX APL compiler permitted removal of 25\% of the run-time checks remaining on 156 benchmarks after other compiler optimizations had eliminated 72\% of the 1251 checks present in the original code. In the SAC compiler, IVE using SCI produced typical speedups of 2--14X on benchmarks operating on arrays of non-fixed rank and shape, compared to the operation of IVE in a non-SCI environment. Shape clique inference data can be exploited to allow certain other optimizations, such as loop fusion and with-loop folding, to be performed on arrays of statically unknown shape and rank, with the potential for significant reductions in execution time.},
      category   = {opt},
      url        = {sc.pdf},
    }
    We introduce shape cliques, a simple way to organize a subset of the arrays appearing in an array-language-based application into sets of identically shaped arrays (shape cliques), and show how a compiler can analyze an application to infer membership in those cliques. We describe an algorithm for performing shape clique inference (SCI), and demonstrate that shape cliques can improve the performance of generated code, by permitting extension of an optimization for removal of run-time checks, and by extending the set of arrays to which optimizations, such as Index Vector Elimination (IVE), can be applied. Implementation of SCI in the APEX APL compiler permitted removal of 25% of the run-time checks remaining on 156 benchmarks after other compiler optimizations had eliminated 72% of the 1251 checks present in the original code. In the SAC compiler, IVE using SCI produced typical speedups of 2–14X on benchmarks operating on arrays of non-fixed rank and shape, compared to the operation of IVE in a non-SCI environment. Shape clique inference data can be exploited to allow certain other optimizations, such as loop fusion and with-loop folding, to be performed on arrays of statically unknown shape and rank, with the potential for significant reductions in execution time.
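    Shape clique inference is naturally phrased as union-find over array identifiers, merging cliques at shape-preserving statements. The sketch below is our reconstruction of that idea, not the APEX or SAC implementation; arrays are plain integer ids here.

      /* Union-find sketch of shape clique inference. */
      #include <stdio.h>

      #define NARR 100
      static int parent[NARR];

      static void init(void) { for (int i = 0; i < NARR; i++) parent[i] = i; }

      static int find(int a) {                 /* with path halving */
          while (parent[a] != a) a = parent[a] = parent[parent[a]];
          return a;
      }

      static void unite(int a, int b) { parent[find(a)] = find(b); }

      /* for an elementwise statement "c = a + b", all three arrays
       * provably share a shape, so their cliques merge */
      static void infer_stmt_elementwise(int result, int left, int right) {
          unite(left, right);
          unite(result, left);
      }

      /* a positive answer lets the compiler drop a run-time shape check */
      static int same_clique(int a, int b) { return find(a) == find(b); }

      int main(void) {
          init();
          infer_stmt_elementwise(2, 0, 1);   /* c = a + b */
          infer_stmt_elementwise(3, 2, 0);   /* d = c * a */
          printf("a,d in same clique: %d\n", same_clique(0, 3));  /* 1 */
          return 0;
      }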
  • Generic Programming on the Structure of Homogeneously Nested Arrays Stephan Herhut, Sven-Bodo Scholz (2006). In 7th Symposium on Trends in Functional Programming. pp. 351–366. University of Nottingham. gpotsohna.pdf BibTexAbstract
    @InProceedings{HerhSchoTFP06,
      AUTHOR    = {Stephan Herhut and Sven-Bodo Scholz},
      TITLE     = {Generic Programming on the Structure of Homogeneously Nested Arrays},
      EDITOR    = {Henrik Nilsson},
      BOOKTITLE = {7th Symposium on Trends in Functional Programming},
      PUBLISHER = {University of Nottingham},
      YEAR      = 2006,
      PAGES     = {351--366},
      TOPICS    = {SAC},
      AFFIL     = {ctca},
      abstract   = {In this paper we propose a new means to model and operate on nested arrays that allows for a high level of abstraction without introducing a performance penalty. We achieve this by using a nesting structure on array types which allows us to shift the nesting information of arrays from the runtime representation level to the type system level. This information can then be exploited for generic function definitions on the nesting structure of arrays which, as we show, neatly integrates with subtyping based function overloading. Finally, we demonstrate for an example how nested arrays and generic function definitions can be fully stripped out using existing optimisation techniques.},
      category   = {types, design},
      url        = {gpotsohna.pdf}
    }
    In this paper we propose a new means to model and operate on nested arrays that allows for a high level of abstraction without introducing a performance penalty. We achieve this by using a nesting structure on array types which allows us to shift the nesting information of arrays from the runtime representation level to the type system level. This information can then be exploited for generic function definitions on the nesting structure of arrays which, as we show, neatly integrates with subtyping based function overloading. Finally, we demonstrate for an example how nested arrays and generic function definitions can be fully stripped out using existing optimisation techniques.
  • Sac: A Functional Array Language for Efficient Multithreaded Execution Clemens Grelck, Sven-Bodo Scholz (2006). International Journal of Parallel Programming 34 (4) pp. 383–427. DOIsafalfeme.pdf BibTexAbstract
    @Article{GrelSchoIJPP06,
      author    = {Clemens Grelck and Sven-Bodo Scholz},
      title     = {Sac: A Functional Array Language for Efficient Multithreaded Execution},
      journal   = {International Journal of Parallel Programming},
      year      = {2006},
      volume    = {34},
      number    = {4},
      pages     = {383--427},
      abstract  = {We give an in-depth introduction to the design of our functional array programming language SaC, the main aspects of its compilation into host machine code, and its parallelisation based on multi-threading. The language design of SaC aims at combining high-level, compositional array programming with fully automatic resource management for highly productive code development and maintenance. We outline the compilation process that maps SaC programs to computing machinery. Here, our focus is on optimisation techniques that aim at restructuring entire applications from nested compositions of general fine-grained operations into specialised coarse-grained operations. We present our implicit parallelisation technology for shared memory architectures based on multi-threading and discuss further optimisation opportunities on this level of code generation. Both optimisation and parallelisation rigorously exploit the absence of side-effects and the explicit data flow characteristic of a functional setting.},
      affil     = {ctca},
      category  = {core, design},
      contents  = {[ISSN: 0885-7458 (Paper) 1573-7640 (Online)]},
      doi       = {10.1007/s10766-006-0018-x},
      sourceurl = {http://dx.doi.org/10.1007/s10766-006-0018-x},
      topics    = {SAC},
      url       = {safalfeme.pdf},
    }
    We give an in-depth introduction to the design of our functional array programming language SaC, the main aspects of its compilation into host machine code, and its parallelisation based on multi-threading. The language design of SaC aims at combining high-level, compositional array programming with fully automatic resource management for highly productive code development and maintenance. We outline the compilation process that maps SaC programs to computing machinery. Here, our focus is on optimisation techniques that aim at restructuring entire applications from nested compositions of general fine-grained operations into specialised coarse-grained operations. We present our implicit parallelisation technology for shared memory architectures based on multi-threading and discuss further optimisation opportunities on this level of code generation. Both optimisation and parallelisation rigorously exploit the absence of side-effects and the explicit data flow characteristic of a functional setting.
  • Merging Compositions of Array Skeletons in SaC Clemens Grelck, Sven-Bodo Scholz (2006). Journal of Parallel Computing 32 (7+8) pp. 507–522. DOI mcoasis.pdf BibTexAbstract
    @Article{GrelSchoPARCO06,
      author    = {Clemens Grelck and Sven-Bodo Scholz},
      title     = {Merging Compositions of Array Skeletons in Sac},
      journal   = {Journal of Parallel Computing},
      year      = {2006},
      volume    = {32},
      number    = {7+8},
      pages     = {507--522},
      abstract  = {The design of skeletons for expressing concurrent computations usually faces a conflict between software engineering demands and performance issues. Whereas the former favour versatile fine-grain skeletons that can be successively combined into larger programs, coarse-grain skeletons are more desirable from a performance perspective. We describe a way out of this dilemma for array skeletons. In the functional array language SaC we internally represent individual array skeletons by one or more meta skeletons, called with-loops. The design of with-loops is carefully chosen to be versatile enough to cope with a large variety of skeletons, yet to be simple enough to allow for compilation into efficiently executable (parallel) code. Furthermore, with-loops are closed with respect to three tailor-made optimisation techniques, that systematically transform compositions of simple, computationally light weight skeletons into few complex and computationally heavier-weight skeletons.},
      affil     = {ctca},
      category  = {opt},
      contents  = {[ISSN 0167-8191]},
      doi       = {10.1016/j.parco.2006.08.003},
      sourceurl = {http://dx.doi.org/10.1016/j.parco.2006.08.003},
      topics    = {SAC},
      url       = {mcoasis.pdf},
    }
    The design of skeletons for expressing concurrent computations usually faces a conflict between software engineering demands and performance issues. Whereas the former favour versatile fine-grain skeletons that can be successively combined into larger programs, coarse-grain skeletons are more desirable from a performance perspective. We describe a way out of this dilemma for array skeletons. In the functional array language SaC we internally represent individual array skeletons by one or more meta skeletons, called with-loops. The design of with-loops is carefully chosen to be versatile enough to cope with a large variety of skeletons, yet simple enough to allow for compilation into efficiently executable (parallel) code. Furthermore, with-loops are closed with respect to three tailor-made optimisation techniques that systematically transform compositions of simple, computationally lightweight skeletons into a few complex and computationally heavier-weight skeletons.
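    As a rough illustration of this skeleton style (our sketch, not code from the paper; the names scale and offset are hypothetical), each generic operation is a thin wrapper around a with-loop, so compositions of such wrappers become compositions of with-loops that the compiler can merge.

      /* Two fine-grained "skeletons", each internally a single with-loop. */
      double[*] scale(double s, double[*] a)
      {
        return with {
                 (0*shape(a) <= iv < shape(a)) : s * a[iv];
               } : genarray(shape(a), 0.0);
      }

      double[*] offset(double o, double[*] a)
      {
        return with {
                 (0*shape(a) <= iv < shape(a)) : a[iv] + o;
               } : genarray(shape(a), 0.0);
      }

      /* A composition such as offset(1.0, scale(2.0, a)) first builds an
         intermediate array; the optimisations discussed above collapse it
         into a single, coarser-grained with-loop. */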
  • With-loop Fusion for Data Locality and Parallelism Clemens Grelck, Karsten Hinckfuß, Sven-Bodo Scholz (2006). In Implementation and Application of Functional Languages, 17th International Workshop (IFL'05), Dublin, Ireland, Revised Selected Papers. pp. 178–195. Springer. DOI wlffdlap.pdf BibTexAbstract
    @InProceedings{GrelHincSchoIFL05,
      author     = {Clemens Grelck and Karsten Hinckfuß and Sven-Bodo Scholz},
      title      = {With-loop Fusion for Data Locality and Parallelism},
      booktitle  = {Implementation and Application of Functional Languages, 17th International Workshop (IFL'05), Dublin, Ireland, Revised Selected Papers},
      year       = {2006},
      editor     = {Andrew Butterfield},
      volume     = {4015},
      series     = {Lecture Notes in Computer Science},
      pages      = {178--195},
      publisher  = {Springer},
      abstract   = {With-loops are versatile array comprehensions used in the functional array language SaC to implement aggregate array operations that are applicable to arrays of any rank and shape. We describe the fusion of with-loops as a novel optimisation technique to improve both the data locality of compiled code in general and the synchronisation behaviour of compiler-parallelised code in particular. Some experiments demonstrate the impact of with-loop fusion on the runtime performance of compiled SaC code.},
      affil      = {ctca},
      category   = {opt},
      doi        = {10.1007/11964681_11},
      pubaddress = {Berlin, Heidelberg, Germany},
      topics     = {SAC},
      url        = {wlffdlap.pdf},
    }
    With-loops are versatile array comprehensions used in the functional array language SaC to implement aggregate array operations that are applicable to arrays of any rank and shape. We describe the fusion of with-loops as a novel optimisation technique to improve both the data locality of compiled code in general and the synchronisation behaviour of compiler-parallelised code in particular. Some experiments demonstrate the impact of with-loop fusion on the runtime performance of compiled SaC code.
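    To make the fused traversal concrete, here is a hedged sketch (ours, not the paper's): two with-loops ranging over the same index space are combined into one traversal that computes both results per element, which improves locality and, in parallelised code, reduces the number of synchronisation points.

      /* Before fusion: two separate traversals of a. */
      b = with { (0*shape(a) <= iv < shape(a)) : 2.0 * a[iv]; }
          : genarray(shape(a), 0.0);
      c = with { (0*shape(a) <= iv < shape(a)) : a[iv] + 1.0; }
          : genarray(shape(a), 0.0);

      /* After fusion (conceptually): a single multi-operator with-loop
         that computes both b and c element by element. */
      b, c = with {
               (0*shape(a) <= iv < shape(a)) : (2.0 * a[iv], a[iv] + 1.0);
             } : (genarray(shape(a), 0.0), genarray(shape(a), 0.0));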
  • A Binding Scope Analysis for Generic Programs on Arrays Clemens Grelck, Sven-Bodo Scholz, Alex Shafarenko (2006). In Implementation and Application of Functional Languages, 17th International Workshop (IFL'05), Dublin, Ireland, Revised Selected Papers. pp. 212–230. Springer. DOI absafgpoa.pdf BibTexAbstract
    @InProceedings{GrelSchoShafIFL05,
      author     = {Clemens Grelck and Sven-Bodo Scholz and Alex Shafarenko},
      title      = {A Binding Scope Analysis for Generic Programs on Arrays},
      booktitle  = {Implementation and Application of Functional Languages, 17th International Workshop (IFL'05), Dublin, Ireland, Revised Selected Papers},
      year       = {2006},
      editor     = {Andrew Butterfield},
      volume     = {4015},
      series     = {Lecture Notes in Computer Science},
      pages      = {212--230},
      publisher  = {Springer},
      affil      = {ctca},
      pubaddress = {Berlin, Heidelberg, Germany},
      topics     = {SAC},
      abstract   = {Performance of generic array programs crucially relies on program specialisation wrt. shape information. Traditionally, this is done in a rather ad hoc fashion by propagating all shape information that is available. When striving for a compositional programming style that adheres to good software engineering principles this approach turns out to be insufficient. Instead, static value information needs to be propagated as well which introduces all the well known problems of partial evaluation in general. In this paper, we propose a static analysis that identifies to what extent specialisation needs to be employed in order to achieve a certain level of shape information. This narrows the scope of specialisation far enough to make specialisation for shape information feasible despite a compositional programming style. Some examples to this effect are presented.},
      category   = {opt},
      doi        = {10.1007/11964681_13},
      url        = {absafgpoa.pdf},
    }
    Performance of generic array programs crucially relies on program specialisation with respect to shape information. Traditionally, this is done in a rather ad hoc fashion by propagating all shape information that is available. When striving for a compositional programming style that adheres to good software engineering principles, this approach turns out to be insufficient. Instead, static value information needs to be propagated as well, which introduces all the well-known problems of partial evaluation in general. In this paper, we propose a static analysis that identifies to what extent specialisation needs to be employed in order to achieve a certain level of shape information. This narrows the scope of specialisation far enough to make specialisation for shape information feasible despite a compositional programming style. Some examples to this effect are presented.
  • Implementing a Numerical Solution of the KPI Equation Using Single Assignment C: Lessons and Experiences Alex Shafarenko, Sven-Bodo Scholz, Stephan Herhut, Clemens Grelck, Kai Trojahner (2006). In Implementation and Application of Functional Languages, 17th International Workshop (IFL'05). Dublin, Ireland, September 19–21, 2005, Revised Selected Papers. pp. 160–177. Springer. DOI iansotkeusac.pdf BibTexAbstract
    @InProceedings{ShafSchoHerhetalIFL05,
      author     = {Alex Shafarenko and Sven-Bodo Scholz and Stephan Herhut and Clemens Grelck and Kai Trojahner},
      title      = {Implementing a Numerical Solution of the Kpi Equation Using Single Assignment C: Lessons and Experiences},
      booktitle  = {Implementation and Application of Functional Languages, 17th International Workshop (IFL'05). Dublin, Ireland, September 19--21, 2005, Revised Selected Papers},
      year       = {2006},
      editor     = {Andrew Butterfield},
      volume     = {4015},
      series     = {Lecture Notes in Computer Science},
      pages      = {160--177},
      publisher  = {Springer},
      abstract   = {We report our experiences of programming in the functional language SaC[1] a numerical method for the KPI (Kadomtsev-Petiviashvili I) equation. KPI describes the propagation of nonlinear waves in a dispersive medium. It is an integro-differential, nonlinear equation with third-order derivatives, and so it presents a noticeable challenge in numerical solution, as well as being an important model for a range of topics in computational physics. The latter include: long internal waves in a density-stratified ocean, ion-acoustic waves in a plasma, acoustic waves on a crystal lattice, and more. Thus our solution of KPI in SaC represents an experience of solving a “real” problem using a single-assignment language and as such provides an insight into the kind of challenges and benefits that arise in using the functional paradigm in computational applications. The paper describes the structure and functionality of the program, discusses the features of functional programming that make it useful for the task in hand, and touches upon performance issues.},
      affil      = {ctca},
      category   = {app},
      doi        = {10.1007/11964681_10},
      pubaddress = {Berlin, Heidelberg, Germany},
      topics     = {SAC},
      url        = {iansotkeusac.pdf},
    }
    We report our experiences of programming, in the functional language SaC [1], a numerical method for the KPI (Kadomtsev-Petviashvili I) equation. KPI describes the propagation of nonlinear waves in a dispersive medium. It is an integro-differential, nonlinear equation with third-order derivatives, and so it presents a noticeable challenge in numerical solution, as well as being an important model for a range of topics in computational physics. The latter include: long internal waves in a density-stratified ocean, ion-acoustic waves in a plasma, acoustic waves on a crystal lattice, and more. Thus our solution of KPI in SaC represents an experience of solving a “real” problem using a single-assignment language and as such provides an insight into the kind of challenges and benefits that arise in using the functional paradigm in computational applications. The paper describes the structure and functionality of the program, discusses the features of functional programming that make it useful for the task in hand, and touches upon performance issues.
  • Functional Array Programming in SaC Sven-Bodo Scholz (2006). In Central European Summer School on Functional Programming. Springer. DOI fapis.pdf BibTexAbstract
    @InCollection{ScholzCEFP05,
      author     = {Sven-Bodo Scholz},
      title      = {Functional Array Programming in Sac},
      booktitle  = {Central European Summer School on Functional Programming},
      publisher  = {Springer},
      year       = {2006},
      editor     = {Zoltan Horvath},
      volume     = {4164},
      series     = {LNCS},
      abstract   = {These notes present an introduction into array-based programming from a functional, i.e., side-effect-free perspective. The first part focuses on promoting arrays as predominant, stateless data structure. This leads to a programming style that favors compositions of generic array operations that manipulate entire arrays over specifications that are made in an element-wise fashion. An algebraicly consistent set of such operations is defined and several examples are given demonstrating the expressiveness of the proposed set of operations. The second part shows how such a set of array operations can be defined within the first-order functional array language SaC. It does not only discuss the language design issues involved but it also tackles implementation issues that are crucial for achieving acceptable runtimes from such genericly specified array operations.},
      affil      = {ctca},
      category   = {design},
      doi        = {10.1007/11894100_3},
      pubaddress = {Berlin, Heidelberg, Germany},
      topics     = {SAC,on SAC},
      url        = {fapis.pdf},
    }
    These notes present an introduction to array-based programming from a functional, i.e., side-effect-free perspective. The first part focuses on promoting arrays as the predominant, stateless data structure. This leads to a programming style that favors compositions of generic array operations that manipulate entire arrays over specifications that are made in an element-wise fashion. An algebraically consistent set of such operations is defined and several examples are given demonstrating the expressiveness of the proposed set of operations. The second part shows how such a set of array operations can be defined within the first-order functional array language SaC. It not only discusses the language design issues involved but also tackles implementation issues that are crucial for achieving acceptable runtimes from such generically specified array operations.

2005

  • A Hybrid Shared Memory Execution Model for a Data Parallel Language with I/O C. Grelck, S. Kuthe, S.-B. Scholz (2005). In 3rd Workshop on High-Level Parallel Programming and Applications (HLPP'05), Warwick, UK. pp. 147–160. University of Warwick, UK. DOI ahsmemfadpl.pdf BibTexAbstract
    @InProceedings{GrelKuthSchoHLPP05,
      author    = {C. Grelck and S. Kuthe and S.-B. Scholz},
      title     = {A Hybrid Shared Memory Execution Model for a Data Parallel Language with I/o},
      booktitle = {3rd Workshop on High-Level Parallel Programming and Applications (HLPP'05), Warwick, UK},
      year      = {2005},
      editor    = {A. Tiskin},
      pages     = {147--160},
      publisher = {University of Warwick, UK},
      abstract  = {We propose a novel execution model for the implicitly parallel execution of data parallel programs in the presence of general I/O operations. This model is called hybrid because it combines the advantages of the standard execution models fork/join and SPMD. Based on program analysis the hybrid model adapts itself to one or the other on the granularity of individual instructions. We outline compilation techniques that systematically derive the organization of parallel code from data flow characteristics aiming at the reduction of execution mode switches in general and synchronization/communication requirements in particular. Experiments based on a prototype implementation show the effectiveness of the hybrid execution model for reducing parallel overhead.},
      affil     = {ctca},
      category  = {design, opt},
      doi       = {10.1142/S012962640800320X},
      topics    = {SAC},
      url       = {ahsmemfadpl.pdf},
    }
    We propose a novel execution model for the implicitly parallel execution of data parallel programs in the presence of general I/O operations. This model is called hybrid because it combines the advantages of the standard execution models fork/join and SPMD. Based on program analysis the hybrid model adapts itself to one or the other on the granularity of individual instructions. We outline compilation techniques that systematically derive the organization of parallel code from data flow characteristics aiming at the reduction of execution mode switches in general and synchronization/communication requirements in particular. Experiments based on a prototype implementation show the effectiveness of the hybrid execution model for reducing parallel overhead.
  • Shared Memory Multiprocessor Support for Functional Array Processing in SaC Clemens Grelck (2005). Journal of Functional Programming 15 (3) pp. 353–401. DOI smmsffapis.pdf BibTexAbstract
    @Article{GrelckJFP05,
      author   = {Clemens Grelck},
      title    = {Shared Memory Multiprocessor Support for Functional Array Processing in Sac},
      journal  = {Journal of Functional Programming},
      year     = {2005},
      volume   = {15},
      number   = {3},
      pages    = {353--401},
      abstract = {Classical application domains of parallel computing are dominated by processing large arrays of numerical data. Whereas most functional languages focus on lists and trees rather than on arrays, SAC is tailor-made in design and in implementation for efficient high-level array processing. Advanced compiler optimizations yield performance levels that are often competitive with low-level imperative implementations. Based on SAC, we develop compilation techniques and runtime system support for the compiler-directed parallel execution of high-level functional array processing code on shared memory architectures. Competitive sequential performance gives us the opportunity to exploit the conceptual advantages of the functional paradigm for achieving real performance gains with respect to existing imperative implementations, not only in comparison with uniprocessor runtimes. While the design of SAC facilitates parallelization, the particular challenge of high sequential performance is that realization of satisfying speedups through parallelization becomes substantially more difficult. We present an initial compilation scheme and multi-threaded execution model, which we step-wise refine to reduce organizational overhead and to improve parallel performance. We close with a detailed analysis of the impact of certain design decisions on runtime performance, based on a series of experiments.},
      category = {design, par},
      contents = {Canonical reference for SAC multithreading},
      doi      = {10.1017/S0956796805005538},
      topics   = {SAC},
      url      = {smmsffapis.pdf},
    }
    Classical application domains of parallel computing are dominated by processing large arrays of numerical data. Whereas most functional languages focus on lists and trees rather than on arrays, SAC is tailor-made in design and in implementation for efficient high-level array processing. Advanced compiler optimizations yield performance levels that are often competitive with low-level imperative implementations. Based on SAC, we develop compilation techniques and runtime system support for the compiler-directed parallel execution of high-level functional array processing code on shared memory architectures. Competitive sequential performance gives us the opportunity to exploit the conceptual advantages of the functional paradigm for achieving real performance gains with respect to existing imperative implementations, not only in comparison with uniprocessor runtimes. While the design of SAC facilitates parallelization, the particular challenge of high sequential performance is that realization of satisfying speedups through parallelization becomes substantially more difficult. We present an initial compilation scheme and multi-threaded execution model, which we step-wise refine to reduce organizational overhead and to improve parallel performance. We close with a detailed analysis of the impact of certain design decisions on runtime performance, based on a series of experiments.
  • With-loop Fusion für die funktionale Programmiersprache SaC (With-loop Fusion for the Functional Programming Language SaC) Karsten Hinckfuß (2005). BibTex
    @MastersThesis{Hinckfuss05,
      author     = {Karsten Hinckfuß},
      title      = {With-loop Fusion Für Die Funktionale Programmiersprache Sac},
      school     = {University of L\"ubeck, Institute of Software Technology and Programming Languages},
      year       = {2005},
      pubaddress = {L\"ubeck, Germany},
      topics     = {SAC},
    }
  • Generic Array Programming in SaC Clemens Grelck, Sven-Bodo Scholz (2005). In 21. Workshop der GI-Fachgruppe 2.1.4 Programmiersprachen und Rechenkonzepte, Bad Honnef, Germany. pp. 43–53. University of Kiel, Institute of Computer Science and Applied Mathematics. gapis.pdf BibTexAbstract
    @InProceedings{GrelSchoGIPR04,
      author    = {Clemens Grelck and Sven-Bodo Scholz},
      title     = {Generic Array Programming in Sac},
      booktitle = {21.~Workshop der GI-Fachgruppe 2.1.4 Programmiersprachen und Rechenkonzepte, Bad Honnef, Germany},
      year      = {2005},
      editor    = {Wolfgang Goerigk},
      volume    = {0410},
      series    = {Technischer Bericht},
      pages     = {43--53},
      publisher = {University of Kiel, Institute of Computer Science and Applied Mathematics},
      abstract  = {SaC is a purely functional array processing language for computationally intensive numerical applications. Its design aims at combining efficiency in program construction with efficiency in parallel program execution. We demonstrate the declarative, generic programming style of SaC by means of a small case study: 3-dimensional complex fast-Fourier transforms. The impact of abstraction on expressiveness, readability, and maintainability of code as well as on clarity of underlying mathematical concepts is discussed and compared with other approaches. We quantify the associated impact on runtime performance both in uniprocessor and in multiprocessor environments.},
      affil     = {ctca},
      category  = { design},
      topics    = {SAC},
      url       = {GAPIS.pdf},
    }
    SaC is a purely functional array processing language for computationally intensive numerical applications. Its design aims at combining efficiency in program construction with efficiency in parallel program execution. We demonstrate the declarative, generic programming style of SaC by means of a small case study: 3-dimensional complex fast-Fourier transforms. The impact of abstraction on expressiveness, readability, and maintainability of code as well as on clarity of underlying mathematical concepts is discussed and compared with other approaches. We quantify the associated impact on runtime performance both in uniprocessor and in multiprocessor environments.

2004

  • Implicit Memory Management for SaC Clemens Grelck, Kai Trojahner (2004). In Implementation and Application of Functional Languages, 16th International Workshop, IFL'04. pp. 335–348. University of Kiel, Institute of Computer Science and Applied Mathematics. greltrojifl04.pdf BibTexAbstract
    @InProceedings{GrelTrojIFL04,
      author    = {Clemens Grelck and Kai Trojahner},
      title     = {Implicit Memory Management for Sac},
      booktitle = {Implementation and Application of Functional Languages, 16th International Workshop, IFL'04},
      year      = {2004},
      editor    = {Clemens Grelck and Frank Huch},
      pages     = {335--348},
      publisher = {University of Kiel, Institute of Computer Science and Applied Mathematics},
      note      = {Technical Report 0408},
      abstract  = {While almost all functional languages rely on garbage collection for implicit memory management, the needs of efficient array processing are better suited by reference counting. The opportunities to reclaim unused memory immediately and to implement functionally sound array operations by destructive in-place updates turn out to be essential for approaching the performance achieved by imperative languages. In this paper we outline the realization of implicit memory management in the functional array language SaC. Starting with basic schemes for the introduction of memory management and reference counting instructions into SaC code, the emphasis is laid on a range of optimizations that aim at reducing runtime overhead and exploiting memory reuse opportunities.},
      category  = {design, opt},
      topics    = {SAC},
      url       = {greltrojifl04.pdf},
    }
    While almost all functional languages rely on garbage collection for implicit memory management, the needs of efficient array processing are better suited by reference counting. The opportunities to reclaim unused memory immediately and to implement functionally sound array operations by destructive in-place updates turn out to be essential for approaching the performance achieved by imperative languages. In this paper we outline the realization of implicit memory management in the functional array language SaC. Starting with basic schemes for the introduction of memory management and reference counting instructions into SaC code, the emphasis is laid on a range of optimizations that aim at reducing runtime overhead and exploiting memory reuse opportunities.
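    A small sketch of the reuse opportunity exploited here (our illustration, not the paper's code): because reference counts are maintained at runtime, a functionally pure update can be compiled into an in-place write whenever the argument array is unshared.

      a = genarray([1000], 0.0);   /* fresh array, reference count 1       */
      b = modarray(a, [42], 1.0);  /* a is not referenced afterwards, so   */
                                   /* the update may reuse a's memory      */
      c = b;                       /* b is now shared; a further update of */
                                   /* b or c would have to copy first      */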
  • Towards Fully Controlled Overloading across Module Boundaries S. Herhut, S.-B. Scholz (2004). In 16th International Workshop on the Implementation and Application of Functional Languages (IFL'04), Lübeck, Germany. pp. 395–408. University of Kiel. DOI tfcoamb.pdf BibTexAbstract
    @InProceedings{HerhSchoIFL04,
      author    = {S. Herhut and S.-B. Scholz},
      title     = {Towards Fully Controlled Overloading across Module Boundaries},
      booktitle = {16th International Workshop on the Implementation and Application of Functional Languages (IFL'04), L\"ubeck, Germany},
      year      = {2004},
      editor    = {C. Grelck and F. Huch},
      pages     = {395--408},
      publisher = {University of Kiel},
      abstract  = {This paper proposes a set of modularisation constructs as well as a new implementation technique for overloading functions across module boundaries. In contrast to existing approaches, it allows to fully preserve separation of namespaces without restricting the potential for later overloadings, including mutual recursion across module boundaries. This gives the programmer control over the visibility of individual function definitions within each namespace and, thus, allows several different sets of instances for a single overloaded function to be used within one program. Based on a very simple applied λ-calculus as core language, the modularisation constructs are defined and a transformation scheme into an applied λ-calculus is presented. Furthermore, an outline of an implementation in the context of the functional programming language SaC is given.},
      affil     = {ctca},
      category  = {design, modules},
      doi       = {10.1.1.408.4298},
      topics    = {SAC,on SAC},
      url       = {TFCOAMB.pdf},
    }
    This paper proposes a set of modularisation constructs as well as a new implementation technique for overloading functions across module boundaries. In contrast to existing approaches, it fully preserves separation of namespaces without restricting the potential for later overloadings, including mutual recursion across module boundaries. This gives the programmer control over the visibility of individual function definitions within each namespace and, thus, allows several different sets of instances for a single overloaded function to be used within one program. Based on a very simple applied λ-calculus as core language, the modularisation constructs are defined and a transformation scheme into an applied λ-calculus is presented. Furthermore, an outline of an implementation in the context of the functional programming language SaC is given.
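    For flavour, a minimal sketch in present-day SaC module syntax (hedged; the paper itself works on core-language constructs, and the module names and functions below are hypothetical): one module exports a function, a second module imports it and adds a further instance, so clients see whichever set of instances they make visible.

      module Norms;            /* file 1; assumes sqrt and sum from the stdlib */
      export all;
      double norm(double[.] v) { return sqrt(sum(v * v)); }

      module MatNorms;         /* file 2 */
      export all;
      import Norms: all;       /* importing, rather than using, lets us add an instance */
      double norm(double[.,.] m) { return maxval({ [i] -> norm(m[[i,.]]) }); }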
  • With-loop Scalarization: Merging Nested Array Operations Clemens Grelck, Sven-Bodo Scholz, Kai Trojahner (2004). In Implementation of Functional Languages, 15th International Workshop (IFL'03), Edinburgh, Scotland, UK, Revised Selected Papers. Springer. DOI wlsmnao.pdf BibTexAbstract
    @InProceedings{GrelSchoTrojIFL03,
      author     = {Clemens Grelck and Sven-Bodo Scholz and Kai Trojahner},
      title      = {With-loop Scalarization: Merging Nested Array Operations},
      booktitle  = {Implementation of Functional Languages, 15th International Workshop (IFL'03), Edinburgh, Scotland, UK, Revised Selected Papers},
      year       = {2004},
      editor     = {Phil Trinder and Greg Michaelson},
      volume     = {3145},
      series     = {Lecture Notes in Computer Science},
      publisher  = {Springer},
      abstract   = {Construction of complex array operations by composition of more basic ones allows for abstract and concise specifications of algorithms. Unfortunately, naïve compilation of such specifications leads to creation of many temporary arrays at runtime and, consequently, to poor performance characteristics. This paper elaborates on a new compiler optimization, named with-loop-scalarization, which aims at eliminating temporary arrays in the context of nested array operations. It is based on with-loops, a versatile array comprehension construct used by the functional array language SaC both for specification as well as for internal representation of array operations. The impact of with-loop scalarization on the runtime performance of compiled SaC code is demonstrated by several experiments involving support for arithmetic on arrays of complex numbers and the application kernel FT from the NAS benchmark suite.},
      category   = {design, opt},
      doi        = {10.1007/978-3-540-27861-0_8},
      pubaddress = {Berlin, Heidelberg, Germany},
      topics     = {SAC,Avoiding Temporaries},
      url        = {WLSMNAO.pdf},
    }
    Construction of complex array operations by composition of more basic ones allows for abstract and concise specifications of algorithms. Unfortunately, naïve compilation of such specifications leads to creation of many temporary arrays at runtime and, consequently, to poor performance characteristics. This paper elaborates on a new compiler optimization, named with-loop-scalarization, which aims at eliminating temporary arrays in the context of nested array operations. It is based on with-loops, a versatile array comprehension construct used by the functional array language SaC both for specification as well as for internal representation of array operations. The impact of with-loop scalarization on the runtime performance of compiled SaC code is demonstrated by several experiments involving support for arithmetic on arrays of complex numbers and the application kernel FT from the NAS benchmark suite.
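    A hedged sketch of the pattern this optimisation targets (our example; the extents n, m and the element function f are hypothetical): a with-loop whose elements are themselves computed by an inner with-loop is rewritten into a single with-loop over the combined index space, so the temporary inner vectors are never materialised.

      /* Nested form: every element of the outer result is a vector. */
      b = with {
            ([0] <= iv < [n]) : with {
                                  ([0] <= jv < [m]) : f(iv[0], jv[0]);
                                } : genarray([m], 0.0);
          } : genarray([n], genarray([m], 0.0));

      /* Scalarized form: one with-loop over the combined index space. */
      b = with {
            ([0,0] <= iv < [n,m]) : f(iv[0], iv[1]);
          } : genarray([n,m], 0.0);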

2003

  • A Compiler Backend for Generic Programming with Arrays Dietmar Kreye (2003). acbfgpwa.pdf BibTex
    @PhdThesis{Kreye03,
      author = {Dietmar Kreye},
      title  = {A Compiler Backend for Generic Programming with Arrays},
      school = {Institute of Computer Science and Applied Mathematics, University of Kiel, Germany},
      year   = {2003},
      note   = {Shaker Verlag, Aachen, 2003},
      topics = {SAC},
      category   = {design},
      url        = {ACBFGPWA.pdf},
    }
  • Towards an Efficient Functional Implementation of the NAS Benchmark FT C. Grelck, S.-B. Scholz (2003). In 7th International Conference on Parallel Computing Technologies (PaCT'03), Nizhni Novgorod, Russia. pp. 230–235. Springer. DOI taefiotnb.pdf BibTexAbstract
    @InProceedings{GrelSchoPaCT03,
      author     = {C. Grelck and S.-B. Scholz},
      title      = {Towards an Efficient Functional Implementation of the Nas Benchmark Ft},
      booktitle  = {7th International Conference on Parallel Computing Technologies (PaCT'03), Nizhni Novgorod, Russia},
      year       = {2003},
      editor     = {V. Malyshkin},
      volume     = {2763},
      series     = {Lecture Notes in Computer Science},
      pages      = {230--235},
      publisher  = {Springer},
      abstract   = {This paper compares a high-level implementation of the NAS benchmark FT in the functional array language SaC with traditional solutions based on Fortran-77 and C. The impact of abstraction on expressiveness, readability, and maintainability of code as well as on clarity of underlying mathematical concepts is discussed. The associated impact on runtime performance is quantified both in a uniprocessor environment as well as in a multiprocessor environment based on automatic parallelization and on OpenMP.},
      category   = {app},
      doi        = {10.1007/978-3-540-45145-7_20},
      pubaddress = {Berlin, Heidelberg, Germany},
      topics     = {SAC},
      url        = {TAEFIOTNB.pdf},
    }
    This paper compares a high-level implementation of the NAS benchmark FT in the functional array language SaC with traditional solutions based on Fortran-77 and C. The impact of abstraction on expressiveness, readability, and maintainability of code as well as on clarity of underlying mathematical concepts is discussed. The associated impact on runtime performance is quantified both in a uniprocessor environment as well as in a multiprocessor environment based on automatic parallelization and on OpenMP.
  • SaC — From High-level Programming with Arrays to Efficient Parallel Execution Clemens Grelck, Sven-Bodo Scholz (2003). Parallel Processing Letters 13 (3) pp. 401–412. DOI sfhlpwatepe.pdf BibTexAbstract
    @Article{GrelSchoPPL03,
      author   = {Clemens Grelck and Sven-Bodo Scholz},
      title    = {Sac --- from High-level Programming with Arrays to Efficient Parallel Execution},
      journal  = {Parallel Processing Letters},
      year     = {2003},
      volume   = {13},
      number   = {3},
      pages    = {401--412},
      abstract = {SAC is a purely functional array processing language designed with numerical applications in mind. It supports generic, high-level program specifications in the style of APL. However, rather than providing a fixed set of built-in array operations, SAC provides means to specify such operations in the language itself in a way that still allows their application to arrays of any rank and size. This paper illustrates the major steps in compiling generic, rank- and shape-invariant SAC specification into efficiently executable multithreaded code for parallel execution on shared memory multiprocessors. The effectiveness of the compilation techniques is demonstrated by means of a small case study on the PDE1 benchmark, which implements 3-dimensional red/black successive over-relaxation. Comparisons with HPF and ZPL show that despite the genericity of code, SAC achieves highly competitive runtime performance characteristics.},
      category = {core,design},
      doi      = {10.1142/S0129626403001379},
      topics   = {SAC},
      url      = {SFHLPWATEPE.pdf},
    }
    SAC is a purely functional array processing language designed with numerical applications in mind. It supports generic, high-level program specifications in the style of APL. However, rather than providing a fixed set of built-in array operations, SAC provides means to specify such operations in the language itself in a way that still allows their application to arrays of any rank and size. This paper illustrates the major steps in compiling generic, rank- and shape-invariant SAC specifications into efficiently executable multithreaded code for parallel execution on shared memory multiprocessors. The effectiveness of the compilation techniques is demonstrated by means of a small case study on the PDE1 benchmark, which implements 3-dimensional red/black successive over-relaxation. Comparisons with HPF and ZPL show that despite the genericity of code, SAC achieves highly competitive runtime performance characteristics.
  • Single Assignment C — Efficient Support for High-level Array Operations in a Functional Setting Sven-Bodo Scholz (2003). Journal of Functional Programming 13 (6) pp. 1005–1059. DOI sacesfhlaoiafs.pdf BibTexAbstract
    @Article{ScholzJFP03,
      author   = {Sven-Bodo Scholz},
      title    = {Single Assignment C --- Efficient Support for High-level Array Operations in a Functional Setting},
      journal  = {Journal of Functional Programming},
      year     = {2003},
      volume   = {13},
      number   = {6},
      pages    = {1005--1059},
      contents = {Canonical reference for SAC},
      topics   = {SAC},
      abstract   = {This paper presents a novel approach for integrating arrays with access time O(1) into functional languages. It introduces n-dimensional arrays combined with a type system that supports hierarchies of array types with varying shape information as well as a shape-invariant form of array comprehension called with-loop. Together, these constructs allow for a programming style similar to that of array programming languages such as APL. We use Single Assignment C (SaC for short), a functional C-variant aimed at numerical applications that is based on the proposed design, to demonstrate that programs written in that style can be compiled to code whose runtime performance is competitive with that of hand-optimized Fortran programs. However, essential prerequisites for such performance figures are a shape inference system integrated in the type system as well as several high-level optimizations. Most notable of these is With-Loop Folding, an optimization technique for eliminating intermediate arrays.},
      category   = {core, design},
      doi        = {10.1.1.138.6995},
      url        = {SACESFHLAOIAFS.pdf},
    }
    This paper presents a novel approach for integrating arrays with access time O(1) into functional languages. It introduces n-dimensional arrays combined with a type system that supports hierarchies of array types with varying shape information as well as a shape-invariant form of array comprehension called with-loop. Together, these constructs allow for a programming style similar to that of array programming languages such as APL. We use Single Assignment C (SaC for short), a functional C-variant aimed at numerical applications that is based on the proposed design, to demonstrate that programs written in that style can be compiled to code whose runtime performance is competitive with that of hand-optimized Fortran programs. However, essential prerequisites for such performance figures are a shape inference system integrated in the type system as well as several high-level optimizations. Most notable of these is With-Loop Folding, an optimization technique for eliminating intermediate arrays.
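    To illustrate the two central ingredients named above, here is a brief, hedged sketch (our example, not the paper's code) of the array type hierarchy and a shape-invariant with-loop:

      /* Types carry varying amounts of static shape information:      */
      /*   double[3,3]  shape known;   double[.,.]  only rank known;   */
      /*   double[+]    rank unknown (non-scalar);  double[*]  any.    */

      /* One shape-invariant definition serves all of these types.     */
      double[*] inc(double[*] a)
      {
        return with {
                 (0*shape(a) <= iv < shape(a)) : a[iv] + 1.0;
               } : modarray(a);
      }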
  • A Multithreaded Compiler Backend for High-level Array Programming Clemens Grelck (2003). In 2nd International Conference on Parallel and Distributed Computing and Networks (PDCN'03), Innsbruck, Austria. pp. 478–484. ACTA Press. DOI amcbfhlap.pdf BibTexAbstract
    @InProceedings{GrelckPDCN03,
      author     = {Clemens Grelck},
      title      = {A Multithreaded Compiler Backend for High-level Array Programming},
      booktitle  = {2nd International Conference on Parallel and Distributed Computing and Networks (PDCN'03), Innsbruck, Austria},
      year       = {2003},
      editor     = {Mohammed H. Hamza},
      pages      = {478--484},
      publisher  = {ACTA Press},
      abstract   = {Whenever large homogeneous data structures need to be processed in a non-trivial way, e.g. in computational sciences, image processing, or system simulation, high-level array programming in the style of APL offers a far more concise and abstract approach than traditional scalar languages such as C/C++ or FORTRAN-77. The same sort of applications often can also be characterized as performance critical and today represents the major domain for parallel processing. This paper reports on the development of a compiler backend which allows to implicitly generate multithreaded code from high-level array program specifications. On shared memory multiprocessor systems, this code can be executed in parallel without any additional programming effort. After sketching out basic compilation schemes, optimizations on the runtime system are addressed and, finally, experimental runtime figures are presented.},
      category   = {core, par, opt},
      contents   = {[ISBN 0-88986-341-5],[ISSN 1027-2666]},
      doi        = {10.1.1.408.8553},
      pubaddress = {Anaheim, CA, USA},
      sourceurl  = {http://www.isp.mu-luebeck.de/~grelck/publications/mt-backend-innsbruck-03.ps.gz},
      topics     = {SAC,Shared Memory},
      url        = {AMCBFHLAP.pdf},
    }
    Whenever large homogeneous data structures need to be processed in a non-trivial way, e.g. in computational sciences, image processing, or system simulation, high-level array programming in the style of APL offers a far more concise and abstract approach than traditional scalar languages such as C/C++ or FORTRAN-77. The same sort of applications can often also be characterized as performance-critical and today represent the major domain for parallel processing. This paper reports on the development of a compiler backend which implicitly generates multithreaded code from high-level array program specifications. On shared memory multiprocessor systems, this code can be executed in parallel without any additional programming effort. After sketching out basic compilation schemes, optimizations of the runtime system are addressed and, finally, experimental runtime figures are presented.
  • Axis Control in SaC Clemens Grelck, Sven-Bodo Scholz (2003). In Implementation of Functional Languages, 14th International Workshop (IFL'02), Madrid, Spain, Revised Selected Papers. pp. 182–198. Springer. DOI acis.pdf BibTexAbstract
    @InProceedings{GrelSchoIFL02,
      author     = {Clemens Grelck and Sven-Bodo Scholz},
      title      = {Axis Control in Sac},
      booktitle  = {Implementation of Functional Languages, 14th International Workshop (IFL'02), Madrid, Spain, Revised Selected Papers},
      year       = {2003},
      editor     = {Ricardo Pe{\~n}a and Thomas Arts},
      volume     = {2670},
      series     = {Lecture Notes in Computer Science},
      pages      = {182--198},
      publisher  = {Springer},
      abstract   = {High-level array processing is characterized by the composition of generic operations, which treat all array elements in a uniform way. This paper proposes a mechanism that allows programmers to direct effects of such array operations to non-scalar subarrays of argument arrays without sacrificing the high-level programming approach. A versatile notation for axis control is presented, and it is shown how the additional language constructs can be transformed into regular SaC code. Furthermore, an optimization technique is introduced which achieves the same runtime performance regardless of whether code is written using the new notation or in a substantially less elegant style employing conventional language features.},
      category   = {core, opt},
      doi        = {10.1.1.540.8938},
      pubaddress = {Berlin, Heidelberg, Germany},
      sourceurl  = {http://www.isp.mu-luebeck.de/~grelck/publications/axis-control-madrid-02.ps.gz},
      topics     = {SAC},
      url        = {ACIS.pdf},
    }
    High-level array processing is characterized by the composition of generic operations, which treat all array elements in a uniform way. This paper proposes a mechanism that allows programmers to direct effects of such array operations to non-scalar subarrays of argument arrays without sacrificing the high-level programming approach. A versatile notation for axis control is presented, and it is shown how the additional language constructs can be transformed into regular SaC code. Furthermore, an optimization technique is introduced which achieves the same runtime performance regardless of whether code is written using the new notation or in a substantially less elegant style employing conventional language features.
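    Roughly, the notation proposed here lets dots mark the axes an operation should range over; the following hedged sketch (ours; the surface syntax follows the paper only loosely) computes row-wise maxima of a matrix m without an explicit with-loop:

      /* i ranges over the first axis; the dot selects the remaining axis, */
      /* so m[[i,.]] denotes one row at a time.                            */
      row_max = { [i] -> maxval(m[[i,.]]) };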

2002

  • Implementing the NAS Benchmark MG in SaC Clemens Grelck (2002). In 16th International Parallel and Distributed Processing Symposium (IPDPS'02), Fort Lauderdale, USA. IEEE Computer Society Press. DOI itnbmis.pdf BibTexAbstract
    @InProceedings{GrelckIPDPS02,
      author    = {Clemens Grelck},
      title     = {Implementing the Nas Benchmark Mg in Sac},
      booktitle = {16th International Parallel and Distributed Processing Symposium (IPDPS'02), Fort Lauderdale, USA},
      year      = {2002},
      editor    = {Viktor K. Prasanna and George Westrom},
      publisher = {IEEE Computer Society Press},
      abstract  = {SAC is a purely functional array processing language designed with numerical applications in mind. It supports generic, high-level program specifications in the style of APL. However, rather than providing a fixed set of builtin array operations, SAC provides means to specify such operations in the language itself in a way that still allows their application to arrays of any dimension and size. This paper illustrates the specificational benefits of this approach by means of a high-level SAC implementation of the NAS benchmark MG realizing 3-dimensional multigrid relaxation with periodic boundary conditions. Despite the high-level approach, experiments show that by means of aggressive compiler optimizations SAC manages to achieve performance characteristics in the range of low-level Fortran and C implementations. For benchmark size class A, SAC is outperformed by the serial Fortran-77 reference implementation of the benchmark by only 23%, whereas SAC itself outperforms a C implementation by the same figure. Furthermore, implicit parallelization of the SAC code for shared memory multiprocessors achieves a speedup of 7.6 with 10 processors. With these figures, SAC outperforms both automatic parallelization of the serial Fortran-77 reference implementation as well as an OpenMP solution based on C code.},
      category  = {core, app},
      doi       = {10.1.1.131.2439},
      sourceurl = {http://www.isp.mu-luebeck.de/~grelck/publications/sac-nas-mg-fortlauderdale-02.ps.gz},
      topics    = {SAC,Benchmarks,NAS},
      url       = {ITNBMIS.pdf},
    }
    SAC is a purely functional array processing language designed with numerical applications in mind. It supports generic, high-level program specifications in the style of APL. However, rather than providing a fixed set of builtin array operations, SAC provides means to specify such operations in the language itself in a way that still allows their application to arrays of any dimension and size. This paper illustrates the specificational benefits of this approach by means of a high-level SAC implementation of the NAS benchmark MG realizing 3-dimensional multigrid relaxation with periodic boundary conditions. Despite the high-level approach, experiments show that by means of aggressive compiler optimizations SAC manages to achieve performance characteristics in the range of low-level Fortran and C implementations. For benchmark size class A, SAC is outperformed by the serial Fortran-77 reference implementation of the benchmark by only 23%, whereas SAC itself outperforms a C implementation by the same figure. Furthermore, implicit parallelization of the SAC code for shared memory multiprocessors achieves a speedup of 7.6 with 10 processors. With these figures, SAC outperforms both automatic parallelization of the serial Fortran-77 reference implementation as well as an OpenMP solution based on C code.
  • A Compilation Scheme for a Hierarchy of Array Types Dietmar Kreye (2002). In Implementation of Functional Languages, 13th International Workshop (IFL'01), Stockholm, Sweden, Selected Papers. pp. 18–35. Springer. DOI acsfahoat.pdf BibTexAbstract
    @InProceedings{KreyeIFL01,
      author     = {Dietmar Kreye},
      title      = {A Compilation Scheme for a Hierarchy of Array Types},
      booktitle  = {Implementation of Functional Languages, 13th International Workshop (IFL'01), Stockholm, Sweden, Selected Papers},
      year       = {2002},
      editor     = {Thomas Arts and Markus Mohnen},
      volume     = {2312},
      series     = {Lecture Notes in Computer Science},
      pages      = {18--35},
      publisher  = {Springer},
      pubaddress = {Berlin, Heidelberg, Germany},
      sourceurl  = {http://www.sac-home.org/publications/sac2c-comp-hierarchy-stockholm-01.ps.gz},
      topics     = {SAC,Implementation of Arrays},
      abstract   = {In order to achieve a high level of abstraction, array-oriented languages provide language constructs for defining array operations in a shape-invariant way. However, when trying to compile such generic array operations into efficiently executable code, static knowledge of exact shapes is essential. Therefore, modern compilers try to infer the shapes of all arrays used in a program. Unfortunately, shape inference is generally undecidable. Therefore, most compilers either rule out all programs for which shape inference fails, or they perform no shape inference at all. In the first case the expressive power of the language is restricted, in the latter the generated code has a weak runtime performance. This paper presents a new compilation scheme for the language Sac which combines these two approaches in order to avoid their individual shortcomings. A preliminary performance evaluation demonstrates the benefits of this compilation scheme.},
      category   = {core, opt},
      doi        = {10.1.1.138.8533},
      url        = {ACSFAHOAT.pdf},
    }
    In order to achieve a high level of abstraction, array-oriented languages provide language constructs for defining array operations in a shape-invariant way. However, when trying to compile such generic array operations into efficiently executable code, static knowledge of exact shapes is essential. Therefore, modern compilers try to infer the shapes of all arrays used in a program. Unfortunately, shape inference is generally undecidable. Therefore, most compilers either rule out all programs for which shape inference fails, or they perform no shape inference at all. In the first case the expressive power of the language is restricted, in the latter the generated code has a weak runtime performance. This paper presents a new compilation scheme for the language SaC which combines these two approaches in order to avoid their individual shortcomings. A preliminary performance evaluation demonstrates the benefits of this compilation scheme.
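    The hierarchy in question can be sketched in SaC type syntax as follows (a hedged illustration; AKS/AKD/AUD is the common shorthand for the three levels): the compiler tries to specialise generic definitions towards the lower, more informative levels, and the compilation scheme presented here generates code for whichever level remains.

      double[3,3] x;   /* AKS: shape statically known              */
      double[.,.] y;   /* AKD: rank known, shape unknown           */
      double[+]   z;   /* AUD: rank and shape unknown (non-scalar) */

      /* A generic function over double[*] is specialised for x and y */
      /* where shapes can be inferred; otherwise shape-generic code   */
      /* with runtime shape computations is emitted.                  */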

2001

  • A Type System for Inferring Array Shapes Sven-Bodo Scholz (2001). In 13th International Workshop on Implementation of Functional Languages (IFL'01), Stockholm, Sweden. pp. 65–82. Ericsson Computer Science Laboratory. BibTex
    @InProceedings{ScholzIFL01,
      author    = {Sven-Bodo Scholz},
      title     = {A Type System for Inferring Array Shapes},
      booktitle = {13th International Workshop on Implementation of Functional Languages (IFL'01), Stockholm, Sweden},
      year      = {2001},
      editor    = {Thomas Arts and Markus Mohnen},
      pages     = {65--82},
      publisher = {Ericsson Computer Science Laboratory},
      sourceurl = {http://www.sac-home.org/publications/sac-inferring-shapes-stockholm-01.ps.gz},
      topics    = {SAC,Types and Typesystems},
    }
  • Implicit Shared Memory Multiprocessor Support for the Functional Programming Language SaC — Single Assignment C Clemens Grelck (2001). BibTex
    @PhdThesis{Grelck01,
      author = {Clemens Grelck},
      title  = {Implicit Shared Memory Multiprocessor Support for the Functional Programming Language Sac --- Single Assignment C},
      school = {Institute of Computer Science and Applied Mathematics, University of Kiel, Germany},
      year   = {2001},
      note   = {Logos Verlag, Berlin, 2001},
      topics = {SAC},
    }
  • Improving Cache Effectiveness through Array Data Layout in SaC Clemens Grelck (2001). In Implementation of Functional Languages, 12th International Workshop (IFL'00), Aachen, Germany, Selected Papers. pp. 231–248. Springer. DOI array-data-layout-aachen-00.pdf BibTexAbstract
    @InProceedings{GrelckIFL00,
      author     = {Clemens Grelck},
      title      = {Improving Cache Effectiveness through Array Data Layout in Sac},
      booktitle  = {Implementation of Functional Languages, 12th International Workshop (IFL'00), Aachen, Germany, Selected Papers},
      year       = {2001},
      editor     = {Markus Mohnen and Pieter Koopman},
      volume     = {2011},
      series     = {Lecture Notes in Computer Science},
      pages      = {231--248},
      publisher  = {Springer},
      abstract   = {SAC is a functional array processing language particularly designed with numerical applications in mind. In this field the runtime performance of programs critically depends on the efficient utilization of the memory hierarchy. Cache conflicts due to limited set associativity are one relevant source of inefficiency. This paper describes the realization of an optimization technique which aims at eliminating cache conflicts by adjusting the data layout of arrays to specific access patterns and cache configurations. Its effect on cache utilization and runtime performance is demonstrated by investigations on the PDE1 benchmark.},
      category   = {core,opt},
      doi        = {10.1007/3-540-45361-X_14},
      pubaddress = {Berlin, Heidelberg, Germany},
      summary    = {This paper describes the implementation of implicit array padding as a way to resolve self-interference cache conflicts in SAC programs. Besides an improved inference heuristic, array access pattern analysis and the realization of padding as a high-level program transformation on intermediate SAC code are outlined.},
      topics     = {SAC,Array Data Layout},
      url        = {array-data-layout-aachen-00.pdf},
    }
    SAC is a functional array processing language particularly designed with numerical applications in mind. In this field the runtime performance of programs critically depends on the efficient utilization of the memory hierarchy. Cache conflicts due to limited set associativity are one relevant source of inefficiency. This paper describes the realization of an optimization technique which aims at eliminating cache conflicts by adjusting the data layout of arrays to specific access patterns and cache configurations. Its effect on cache utilization and runtime performance is demonstrated by investigations on the PDE1 benchmark.

2000

  • On Interfacing SaC Modules with C Programs Nico Marcussen-Wulff, Sven-Bodo Scholz (2000). In 12th International Workshop on Implementation of Functional Languages (IFL'00), Aachen, Germany. pp. 381–386. Technical University of Aachen. c-interface-aachen-00.pdf BibTexAbstract
    @InProceedings{MarcSchoIFL00draft,
      author    = {Nico Marcussen-Wulff and Sven-Bodo Scholz},
      title     = {On Interfacing Sac Modules with C Programs},
      booktitle = {12th International Workshop on Implementation of Functional Languages (IFL'00), Aachen, Germany},
      year      = {2000},
      editor    = {Markus Mohnen and Pieter Koopman},
      volume    = {AIB-00-7},
      series    = {Aachener Informatik-Berichte},
      pages     = {381--386},
      publisher = {Technical University of Aachen},
      abstract  = {This paper is on interfacing C programs with modules written in Sac, a functional language with powerful array processing facilities. Interfacing program modules written in different languages has become an increasingly important issue of economic software design. It allows to re-use as much as possible existing code (program libraries) and to import special language features not available in the language in which the main program is written.},
      category  = {ffi,modules},
      summary   = {This draft paper describes an interface which allows to use SAC modules from C programs.},
      topics    = {SAC,Language Interfacing},
      url       = {c-interface-aachen-00.pdf},
    }
    This paper is on interfacing C programs with modules written in SaC, a functional language with powerful array processing facilities. Interfacing program modules written in different languages has become an increasingly important issue of economic software design. It makes it possible to re-use existing code (program libraries) as far as possible and to import special language features that are not available in the language in which the main program is written.
  • HPF vs. SaC — A Case Study Clemens Grelck, Sven-Bodo Scholz (2000). In Euro-Par 2000 Parallel Processing, 6th International Euro-Par Conference (Euro-Par'00), Munich, Germany. pp. 620–624. Springer. DOI hpf-vs-sac-muenchen-00.pdf BibTexAbstract
    @InProceedings{GrelSchoEUROPAR00,
      author     = {Clemens Grelck and Sven-Bodo Scholz},
      title      = {Hpf Vs. Sac --- a Case Study},
      booktitle  = {Euro-Par 2000 Parallel Processing, 6th International Euro-Par Conference (Euro-Par'00), Munich, Germany},
      year       = {2000},
      editor     = {Arndt Bode and Thomas Ludwig and Wolfgang Karl and Roland Wismüller},
      volume     = {1900},
      series     = {Lecture Notes in Computer Science},
      pages      = {620--624},
      publisher  = {Springer},
      abstract   = {This paper compares the functional programming language Sac to Hpf with respect to specificational elegance and runtime performance. A well-known benchmark, red-black SOR, serves as a case study. After presenting the Hpf reference implementation alternative Sac implementations are discussed. Eventually, performance figures show the ability to compile highly generic Sac specifications into machine code that outperforms the Hpf implementation on a shared memory multi-processor by a factor of about 3.},
      category   = {core,apps,par},
      doi        = {10.1007/3-540-44520-X_87},
      pubaddress = {Berlin, Heidelberg, Germany},
      summary    = {This paper compares SAC to HPF in terms of expressiveness and execution runtimes. Red/black successive over-relaxation serves as a case study. Several SAC specifications are presented and compared with a HPF reference implementation. Performance figures for up to 10 processors of a shared memory multiprocessor are presented.},
      topics     = {SAC,HPF},
      url        = {hpf-vs-sac-muenchen-00.pdf},
    }
    This paper compares the functional programming language Sac to Hpf with respect to specificational elegance and runtime performance. A well-known benchmark, red-black SOR, serves as a case study. After presenting the Hpf reference implementation, alternative Sac implementations are discussed. Finally, performance figures demonstrate that highly generic Sac specifications can be compiled into machine code that outperforms the Hpf implementation on a shared memory multiprocessor by a factor of about 3.
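    To give a flavour of such Sac specifications, here is a minimal sketch of a single relaxation sweep (a deliberately simplified Jacobi-style update that ignores the red/black colouring used in the benchmark; the function name is illustrative, and the vector arithmetic on index vectors assumes the standard Array module):

      use Array: all;

      /* One sweep over the interior of a matrix of any size;
         border elements are left unchanged by modarray.       */
      double[.,.] relax( double[.,.] a)
      {
        res = with {
                ( [1,1] <= iv < shape( a) - 1) :
                  0.25 * ( a[iv + [1,0]] + a[iv - [1,0]]
                         + a[iv + [0,1]] + a[iv - [0,1]]);
              } : modarray( a);
        return( res);
      }

    Because the with-loop abstracts from the matrix size, the same definition serves all problem sizes, which is exactly the genericity the comparison with Hpf exploits.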
  • Array Padding in the Functional Language Sac Clemens Grelck (2000). In International Conference on Parallel and Distributed Processing Techniques and Applications (PDPTA'00), Las Vegas, Nevada, USA. pp. 2553–2560. CSREA Press, Athens, Georgia, USA. array-padding-lasvegas-00.pdf BibTexAbstract
    @InProceedings{GrelckPDPTA00,
      author    = {Clemens Grelck},
      title     = {Array Padding in the Functional Language Sac},
      booktitle = {International Conference on Parallel and Distributed Processing Techniques and Applications (PDPTA'00), Las Vegas, Nevada, USA},
      year      = {2000},
      editor    = {Hamid R. Arabnia},
      volume    = {5},
      pages     = {2553--2560},
      publisher = {CSREA Press, Athens, Georgia, USA},
      abstract  = {Sac is a functional array processing language that tries to combine generic, high-level program specifications with efficient runtime behavior. Being particularly designed for numerical applications, runtime performance critically depends on the effective utilization of the memory hierarchy. For many programs, however, it can be observed that the achieved performance significantly changes with small variations in the problem size. Array padding is a well-known optimization technique that adjusts the data layout of arrays in order to make better usage of caches. The paper presents an algorithm that derives a customized data layout from an array access pattern and a cache specification. Cache phenomena such as spatial and temporal reuse are taken into account as well as different cache architectures. The effectiveness is demonstrated by investigations on the runtime performance of the PDE1 benchmark on a shared memory multiprocessor.},
      category  = {core,opt,par},
      summary   = {This paper describes an inference strategy to identify suitable pad sizes for intra-array padding in order to eliminate self interference cache conflicts in numerical algorithms. Its performance impact in particular with respect to multithreaded program execution is shown by means of runtime figures.},
      topics    = {SAC,Array Data Layout},
      url       = {array-padding-lasvegas-00.pdf},
    }
    Sac is a functional array processing language that tries to combine generic, high-level program specifications with efficient runtime behavior. Being particularly designed for numerical applications, runtime performance critically depends on the effective utilization of the memory hierarchy. For many programs, however, it can be observed that the achieved performance significantly changes with small variations in the problem size. Array padding is a well-known optimization technique that adjusts the data layout of arrays in order to make better usage of caches. The paper presents an algorithm that derives a customized data layout from an array access pattern and a cache specification. Cache phenomena such as spatial and temporal reuse are taken into account as well as different cache architectures. The effectiveness is demonstrated by investigations on the runtime performance of the PDE1 benchmark on a shared memory multiprocessor.
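    The padding transformation itself is applied by the compiler on intermediate code, but its effect can be spelled out at the source level. The following hand-written analogue (shapes and pad size purely illustrative) widens each row of a 1024x1024 array by eight unused columns, so that consecutive rows no longer map to the same cache sets:

      use Array: all;

      /* Manual stand-in for intra-array padding: embed a into a
         1024x1032 array; the last 8 columns stay unused.  The
         inference described in the paper chooses such pad sizes
         automatically from access patterns and cache parameters. */
      double[1024,1032] pad( double[1024,1024] a)
      {
        res = with {
                ( [0,0] <= iv < [1024,1024]) : a[iv];
              } : genarray( [1024,1032], 0.0);
        return( res);
      }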
  • On Code Generation for Multi-generator With-loops in Sac Clemens Grelck, Dietmar Kreye, Sven-Bodo Scholz (2000). In Implementation of Functional Languages, 11th International Workshop (IFL'99), Lochem, The Netherlands, Selected Papers. pp. 77–94. Springer. DOIsac2c-codegen-wl-lochem-99.pdf BibTexAbstract
    @InProceedings{GrelKreySchoIFL99,
      author     = {Clemens Grelck and Dietmar Kreye and Sven-Bodo Scholz},
      title      = {On Code Generation for Multi-generator With-loops in Sac},
      booktitle  = {Implementation of Functional Languages, 11th International Workshop (IFL'99), Lochem, The Netherlands, Selected Papers},
      year       = {2000},
      editor     = {Pieter Koopman and Chris Clack},
      volume     = {1868},
      series     = {Lecture Notes in Computer Science},
      pages      = {77--94},
      publisher  = {Springer},
      abstract   = {Most array operations in Sac are specified in terms of so-called with-loops, a sac-specific form of array comprehension. Due to the map-like semantics of with-loops its loop instances can be computed in any order which provides considerable freedom when it comes to compiling them into nestings of for-loops in C. This paper discusses several different execution orders and their impact on compilation complexity, size of generated code, and execution runtimes. As a result, a multiply parameterized compilation scheme is proposed which achieves speedups of up to a factor of 16 when compared against a naïve compilation scheme.},
      category   = {core,design},
      doi        = {10.1007/10722298_5},
      pubaddress = {Berlin, Heidelberg, Germany},
      summary    = {This paper presents the essentials of Dietmar's Diploma thesis (see below). Besides an informal presentation of the compilation steps involved, the design choices made are substantiated by means of a few performance measurements.},
      topics     = {SAC,Loop Transformation},
      url        = {sac2c-codegen-WL-lochem-99.pdf},
    }
    Most array operations in Sac are specified in terms of so-called with-loops, a Sac-specific form of array comprehension. Due to the map-like semantics of with-loops, their loop instances can be computed in any order, which provides considerable freedom when it comes to compiling them into nestings of for-loops in C. This paper discusses several different execution orders and their impact on compilation complexity, size of generated code, and execution runtimes. As a result, a multiply parameterized compilation scheme is proposed which achieves speedups of up to a factor of 16 when compared against a naïve compilation scheme.
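    For readers unfamiliar with the construct, the following sketch shows a multi-generator with-loop in current SaC syntax (function name and bounds are illustrative). Each generator covers a disjoint part of the index space; the compilation scheme discussed in the paper decides in which order these parts are traversed:

      use Array: all;

      /* Three generators: zeroed borders, doubled interior. */
      int[100] frame( int[100] a)
      {
        res = with {
                ( [0]  <= iv < [1])   : 0;          /* left border  */
                ( [1]  <= iv < [99])  : 2 * a[iv];  /* interior     */
                ( [99] <= iv < [100]) : 0;          /* right border */
              } : genarray( [100], 0);
        return( res);
      }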

1999

  • Accelerating Apl Programs with Sac Clemens Grelck, Sven-Bodo Scholz (1999). In International Conference on Array Processing Languages (APL'99), Scranton, Pennsylvania, USA. pp. 50–57. ACM Press. DOIsac-accel-apl-scranton-99.pdf BibTexAbstract
    @InProceedings{GrelSchoAPL99,
      author    = {Clemens Grelck and Sven-Bodo Scholz},
      title     = {Accelerating Apl Programs with Sac},
      booktitle = {International Conference on Array Processing Languages (APL'99), Scranton, Pennsylvania, USA},
      year      = {1999},
      editor    = {Olivier Lefèvre},
      volume    = {29},
      number    = {2},
      series    = {APL Quote Quad},
      pages     = {50--57},
      publisher = {ACM Press},
      abstract  = {The paper investigates, how SAC, a purely functional language based on C syntax, relates to APL in terms of expressiveness and run-time behavior. To do so, three different excerpts of real world APL programs are examined. It is shown that after defining the required APL primitives in SAC, the example programs can be re-written in SAC with an almost one-to-one correspondence. Run-time comparisons between interpreting APL programs and compiled SAC programs show that speedups due to compilation vary between 2 and 500 for three representative benchmark programs.},
      category  = {core,apps,design},
      doi       = {10.1145/379277.312719},
      summary   = {This paper compares SAC to APL in terms of expressiveness and execution runtimes. Some APL applications are compiled by hand into almost identical SAC programs and the resulting execution times are discussed.},
      topics    = {SAC,APL},
      url       = {sac-accel-apl-scranton-99.pdf},
    }
    The paper investigates how SAC, a purely functional language based on C syntax, relates to APL in terms of expressiveness and run-time behavior. To do so, three different excerpts of real-world APL programs are examined. It is shown that after defining the required APL primitives in SAC, the example programs can be re-written in SAC with an almost one-to-one correspondence. Run-time comparisons between interpreting APL programs and compiled SAC programs show that speedups due to compilation vary between 2 and 500 for three representative benchmark programs.
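    The approach rests on first defining the required APL primitives in SaC. As a rough sketch of what such a definition looks like (the function name is made up; the SaC standard library provides a more general take), a vector take is a single with-loop:

      use Array: all;

      /* APL-style take on integer vectors: the first n elements. */
      int[.] mytake( int n, int[.] v)
      {
        res = with {
                ( [0] <= iv < [n]) : v[iv];
              } : genarray( [n], 0);
        return( res);
      }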
  • Shared Memory Multiprocessor Support for Sac Clemens Grelck (1999). In Implementation of Functional Languages, 10th International Workshop (IFL'98), London, England, UK, Selected Papers. pp. 38–54. Springer. DOImt-support-london-98.pdf BibTexAbstract
    @InProceedings{GrelckIFL98,
      author     = {Clemens Grelck},
      title      = {Shared Memory Multiprocessor Support for Sac},
      booktitle  = {Implementation of Functional Languages, 10th International Workshop (IFL'98), London, England, UK, Selected Papers},
      year       = {1999},
      editor     = {Kevin Hammond and Tony Davie and Chris Clack},
      volume     = {1595},
      series     = {Lecture Notes in Computer Science},
      pages      = {38--54},
      publisher  = {Springer},
      abstract   = {Sac (Single Assignment C) is a strict, purely functional programming language primarily designed with numerical applications in mind. Particular emphasis is on efficient support for arrays both in terms of language expressiveness and in terms of runtime performance. Array operations in Sac are based on element-wise specifications using so-called With-loops. These language constructs are also well-suited for concurrent execution on multiprocessor systems. This paper outlines an implicit approach to compile Sac programs for multi-threaded execution on shared memory architectures. Besides the basic compilation scheme, a brief overview of the runtime system is given. Finally, preliminary performance figures demonstrate that this approach is well-suited to achieve almost linear speedups.},
      category   = {core,design,par,opt},
      doi        = {10.1007/3-540-48515-5_3},
      pubaddress = {Berlin, Heidelberg, Germany},
      summary    = {This paper introduces the basic concepts of multi-threaded program execution on shared memory multiprocessor systems. The described approach is completely implicit, i.e. the compiler and the runtime system handle everything from granularity to load balancing on their own. The paper also contains preliminary performance evaluations.},
      topics     = {SAC,Shared Memory},
      url        = {mt-support-london-98.pdf},
    }
    Sac (Single Assignment C) is a strict, purely functional programming language primarily designed with numerical applications in mind. Particular emphasis is on efficient support for arrays both in terms of language expressiveness and in terms of runtime performance. Array operations in Sac are based on element-wise specifications using so-called With-loops. These language constructs are also well-suited for concurrent execution on multiprocessor systems. This paper outlines an implicit approach to compile Sac programs for multi-threaded execution on shared memory architectures. Besides the basic compilation scheme, a brief overview of the runtime system is given. Finally, preliminary performance figures demonstrate that this approach is well-suited to achieve almost linear speedups.
  • A Case Study: Effects of With-loop Folding on the Nas Benchmark Mg in Sac Sven-Bodo Scholz (1999). In Implementation of Functional Languages, 10th International Workshop (IFL'98), London, England, UK, Selected Papers. pp. 216–228. Springer. DOIwlf-exp-london-98.pdf BibTexAbstract
    @InProceedings{ScholzIFL98,
      author     = {Sven-Bodo Scholz},
      title      = {A Case Study: Effects of With-loop Folding on the Nas Benchmark Mg in Sac},
      booktitle  = {Implementation of Functional Languages, 10th International Workshop (IFL'98), London, England, UK, Selected Papers},
      year       = {1999},
      editor     = {Kevin Hammond and Tony Davie and Chris Clack},
      volume     = {1595},
      series     = {Lecture Notes in Computer Science},
      pages      = {216--228},
      publisher  = {Springer},
      abstract   = {Sac is a functional C variant with efficient support for high-level array operations. This paper investigates the applicability of a Sac specific optimization technique called WITH-loop-folding to real world applications. As an example program which originates from the Numerical Aerodynamic Simulation (NAS) Program developed at NASA Ames Research Center, the so-called NAS benchmark MG is chosen. It comprises a kernel from the NAS Program which implements 3-dimensional multigrid relaxation. Several run-time measurements exploit two different benefits of withloop-folding: First, an overall speed-up of about 20% can be observed. Second, a comparison between the run-times of a hand-optimized specification and of APL-like specifications yields identical run-times, although a naive compilation that does not apply With-loop-folding leads to slowdowns of more than an order of magnitude. Furthermore, With-loopfolding makes a slight variation of the algorithm feasible which substantially simplifies the program specification and requires less memory during execution. Finally, the optimized run-times are compared against run-times gained from the original Fortran program, which shows that for different problem sizes, the code generated from the Sac program does not only reach the execution times of the code generated from the Fortran program but even outperforms them by about 10%.},
      category   = {apps,core,opt},
      doi        = {10.1007/3-540-48515-5_14},
      pubaddress = {Berlin, Heidelberg, Germany},
      summary    = {This paper investigates the impact of high-level program transformations, WITH-loop folding in particular, on the style of program specifications and the runtime performance of programs.},
      topics     = {SAC,Avoiding Temporaries,NAS},
      url        = {wlf-exp-london-98.pdf},
    }
    Sac is a functional C variant with efficient support for high-level array operations. This paper investigates the applicability of a Sac-specific optimization technique called With-loop-folding to real world applications. As an example program which originates from the Numerical Aerodynamic Simulation (NAS) Program developed at NASA Ames Research Center, the so-called NAS benchmark MG is chosen. It comprises a kernel from the NAS Program which implements 3-dimensional multigrid relaxation. Several run-time measurements exploit two different benefits of With-loop-folding: First, an overall speed-up of about 20% can be observed. Second, a comparison between the run-times of a hand-optimized specification and of APL-like specifications yields identical run-times, although a naive compilation that does not apply With-loop-folding leads to slowdowns of more than an order of magnitude. Furthermore, With-loop-folding makes a slight variation of the algorithm feasible which substantially simplifies the program specification and requires less memory during execution. Finally, the optimized run-times are compared against run-times gained from the original Fortran program, which shows that for different problem sizes, the code generated from the Sac program does not only reach the execution times of the code generated from the Fortran program but even outperforms them by about 10%.

1998

  • On Defining Application-specific High-level Array Operations by Means of Shape-invariant Programming Facilities Sven-Bodo Scholz (1998). In International Conference on Array Processing Languages (APL'98), Rome, Italy. pp. 40–45. ACM Press. DOIsac-defining-array-ops-rome-98.pdf BibTexAbstract
    @InProceedings{ScholzAPL98,
      author    = {Sven-Bodo Scholz},
      title     = {On Defining Application-specific High-level Array Operations by Means of Shape-invariant Programming Facilities},
      booktitle = {International Conference on Array Processing Languages (APL'98), Rome, Italy},
      year      = {1998},
      editor    = {Sergio Picchi and Marco Micocci},
      pages     = {40--45},
      publisher = {ACM Press},
      abstract  = {Most of the existing high-level array-processing languages support a fixed set of pre-defined array operations and a few higher-order functions for constructing new array operations from existing ones. In this paper, we discuss a more general approach made feasible by SAC (for Single Assignement C), a functional variant of C.SAC provides a meta-level language construct called WITH-loop which may be considered a sophisticated variant of the FORALL-loops in HPF or of array comprehensions in functional languages. It allows for the element-wise specification of high-level operations on arrays of any dimensionality: any set of high-level array operations can be specified by means of WITH-loops and be made available in a library. This not only improves the flexibility of specifications, but also simplifies the compilation process.By means of a few examples it is shown that the high-level operations that are typically available in array processing languages such as APL or FORTRAN90 can be easily specified as WITH-loops in SAC. Furthermore, we briefly outline the most important optimization techniques used in the current SAC compiler for achieving efficiently executable code.The paper finally presents a performance comparison between a high-level specification for the multigrid relaxation kernel of the NAS benchmarks in SAC on the one hand and low-level specifications in SISAL and in FORTRAN77 on the other hand. It shows that the SAC implementation, despite its higher level of abstraction, is competitive with the other two both in terms of program runtimes and memory consumption.},
      category  = {core,design},
      doi       = {10.1145/327559.327613},
      summary   = {This paper shows how APL programs can be translated into equivalent but substantially faster SAC programs with little effort.},
      topics    = {SAC},
      url       = {sac-defining-array-ops-rome-98.pdf},
    }
    Most of the existing high-level array-processing languages support a fixed set of pre-defined array operations and a few higher-order functions for constructing new array operations from existing ones. In this paper, we discuss a more general approach made feasible by SAC (for Single Assignment C), a functional variant of C. SAC provides a meta-level language construct called WITH-loop which may be considered a sophisticated variant of the FORALL-loops in HPF or of array comprehensions in functional languages. It allows for the element-wise specification of high-level operations on arrays of any dimensionality: any set of high-level array operations can be specified by means of WITH-loops and be made available in a library. This not only improves the flexibility of specifications, but also simplifies the compilation process. By means of a few examples it is shown that the high-level operations that are typically available in array processing languages such as APL or FORTRAN90 can be easily specified as WITH-loops in SAC. Furthermore, we briefly outline the most important optimization techniques used in the current SAC compiler for achieving efficiently executable code. The paper finally presents a performance comparison between a high-level specification for the multigrid relaxation kernel of the NAS benchmarks in SAC on the one hand and low-level specifications in SISAL and in FORTRAN77 on the other hand. It shows that the SAC implementation, despite its higher level of abstraction, is competitive with the other two both in terms of program runtimes and memory consumption.
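    A minimal sketch of such a library-defined, shape-invariant operation (illustrative only; the SaC standard library actually provides element-wise arithmetic via overloading): addition for arrays of arbitrary rank, where the with-loop ranges over the entire index space of the first argument:

      use Array: all;

      /* Element-wise addition for any rank: 0 * shape(a) yields
         a zero index vector of the appropriate length.          */
      int[*] eadd( int[*] a, int[*] b)
      {
        res = with {
                ( 0 * shape( a) <= iv < shape( a)) : a[iv] + b[iv];
              } : genarray( shape( a), 0);
        return( res);
      }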
  • With-loop-folding in Sac — Condensing Consecutive Array Operations Sven-Bodo Scholz (1998). In Implementation of Functional Languages, 9th International Workshop (IFL'97), St. Andrews, UK, Selected Papers. pp. 72–92. Springer. DOIwlf-st-andrews-97.pdf BibTexAbstract
    @InProceedings{ScholzIFL1997,
      author     = {Sven-Bodo Scholz},
      title      = {With-loop-folding in Sac --- Condensing Consecutive Array Operations},
      booktitle  = {Implementation of Functional Languages, 9th International Workshop (IFL'97), St. Andrews, UK, Selected Papers},
      year       = {1998},
      editor     = {Chris Clack and Tony Davie and Kevin Hammond},
      volume     = {1467},
      series     = {Lecture Notes in Computer Science},
      pages      = {72--92},
      publisher  = {Springer},
      abstract   = {This paper introduces a new compiler optimization called With-loop-folding. It is based on a special loop construct, the with-loop, which in the functional language SAC (for Single Assignment C) serves as a versatile vehicle to describe array operations on an elementwise basis. A general mechanism for combining two of these With-loops into a single loop construct is presented. This mechanism constitutes a powerful tool when it comes to generate efficiently executable code from high-level array specifications. By means of a few examples it is shown that even complex nestings of array operations similar to those available in Apl can be transformed into single loop operations which are similar to hand-optimized With-loop specifications. As a consequence, the way a complex array operation is combined from primitive array operations does not affect the runtime performance of the compiled code, i.e., the programmer is liberated from the burden to take performance considerations into account when specifying complex array operations.},
      category   = {core,design,opt},
      doi        = {10.1007/BFb0055425},
      isbn       = {978-3-540-64849-9},
      pubaddress = {Berlin, Heidelberg, Germany},
      topics     = {SAC,Avoiding Temporaries,Implementation of Arrays},
      url        = {wlf-st-andrews-97.pdf},
    }
    This paper introduces a new compiler optimization called With-loop-folding. It is based on a special loop construct, the with-loop, which in the functional language SAC (for Single Assignment C) serves as a versatile vehicle to describe array operations on an elementwise basis. A general mechanism for combining two of these With-loops into a single loop construct is presented. This mechanism constitutes a powerful tool when it comes to generate efficiently executable code from high-level array specifications. By means of a few examples it is shown that even complex nestings of array operations similar to those available in Apl can be transformed into single loop operations which are similar to hand-optimized With-loop specifications. As a consequence, the way a complex array operation is combined from primitive array operations does not affect the runtime performance of the compiled code, i.e., the programmer is liberated from the burden to take performance considerations into account when specifying complex array operations.
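    The effect of With-loop-folding can be sketched on a toy fragment (illustrative SaC, assumed to appear inside a function body with a vector a of length 100 in scope): two consecutive with-loops that communicate through a temporary array are condensed into one, so the intermediate never materialises:

      /* Before folding: b is a temporary array.                          */
      b = with { ( [0] <= iv < [100]) : a[iv] + 1; } : genarray( [100], 0);
      c = with { ( [0] <= iv < [100]) : 2 * b[iv]; } : genarray( [100], 0);

      /* After folding: a single traversal, no temporary.                 */
      c = with { ( [0] <= iv < [100]) : 2 * (a[iv] + 1); } : genarray( [100], 0);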

1997

  • An Overview of Sac – a Functional Language for Numerical Applications Sven-Bodo Scholz (1997). In Programming Languages and Fundamentals of Programming, Technical Report 9717. Institut für Informatik und Praktische Mathematik, Universität Kiel. BibTex
    @InProceedings{ScholzKPS97,
      author    = {Sven-Bodo Scholz},
      title     = {An Overview of Sac -- a Functional Language for Numerical Applications},
      booktitle = {Programming Languages and Fundamentals of Programming, Technical Report 9717},
      year      = {1997},
      editor    = {R. Berghammer and F. Simon},
      publisher = {{Institut f{\"u}r Informatik und Praktische Mathe\-matik, Universit{\"a}t Kiel}},
      category  = {core,design,apps},
      topics    = {SAC,on SAC},
    }
  • On Programming Scientific Applications in Sac - a Functional Language Extended by a Subsystem for High-level Array Operations Sven-Bodo Scholz (1997). In Implementation of Functional Languages, 8th International Workshop (IFL'96), Bonn, Germany, Selected Papers. pp. 85–104. Springer-Verlag. scientific-applications-sac-bonn-96.pdf BibTexAbstract
    @InProceedings{ScholzIFL96,
      author     = {Sven-Bodo Scholz},
      title      = {On Programming Scientific Applications in Sac - a Functional Language Extended by a Subsystem for High-level Array Operations},
      booktitle  = {Implementation of Functional Languages, 8th International Workshop (IFL'96), Bonn, Germany, Selected Papers},
      year       = {1997},
      editor     = {Werner Kluge},
      volume     = {1268},
      series     = {Lecture Notes in Computer Science},
      pages      = {85--104},
      publisher  = {Springer-Verlag},
      abstract   = {This paper discusses some of the pros and cons of extending a simple functional language called Sac (for Single-Assignment C) by array operations similar to those that are available in APL. The array operations in Sac are based on the ψ-calculus, an algebra of arrays which provides a formalism for specifying and simplifying array operations in terms of index set manipulations. The programming techniques made possible by Sac are demonstrated by means of a functional program for the approximation of numerical solutions of partial differential equations by multigrid relaxation. This application is not only of practical relevance but also fully exposes the flavors of using high-level array operations. In contrast to specifications in other languages, e.g. in Fortran or Sisal, the Sac program is well structured, reasonably concise, and - what is most important - invariant against dimensionalities and shapes. However, sophisticated compilation techniques are necessary to avoid - whenever possible - the creation of temporary arrays and to eliminate redundant operations. The paper also includes performance figures for a Sac implementation of the NAS-mgrid-benchmark which are competitive with those of a Sisal implementation.},
      category   = {apps,core,opt},
      pubaddress = {Berlin, Heidelberg, Germany},
      summary    = {This paper describes the high-level programming style intended by SAC. Multi-dimensional multigrid relaxation is investigated as a case study. The runtime performance of the SAC implementation is compared to equivalent Sisal and Fortran specifications both in terms of execution time and memory consumption.},
      topics     = {SAC,Scientific Computation},
      url        = {scientific-applications-sac-bonn-96.pdf},
    }
    This paper discusses some of the pros and cons of extending a simple functional language called Sac (for Single-Assignment C) by array operations similar to those that are available in APL. The array operations in Sac are based on the ψ-calculus, an algebra of arrays which provides a formalism for specifying and simplifying array operations in terms of index set manipulations. The programming techniques made possible by Sac are demonstrated by means of a functional program for the approximation of numerical solutions of partial differential equations by multigrid relaxation. This application is not only of practical relevance but also fully exposes the flavors of using high-level array operations. In contrast to specifications in other languages, e.g. in Fortran or Sisal, the Sac program is well structured, reasonably concise, and - what is most important - invariant against dimensionalities and shapes. However, sophisticated compilation techniques are necessary to avoid - whenever possible - the creation of temporary arrays and to eliminate redundant operations. The paper also includes performance figures for a Sac implementation of the NAS-mgrid-benchmark which are competitive with those of a Sisal implementation.

1996

  • Single Assignment C – Entwurf Und Implementierung Einer Funktionalen C-variante Mit Spezieller Unterstützung Shape-invarianter Array-operationen Sven-Bodo Scholz (1996). Kiel, Germany. sac-design-sbs-phd-96.pdf BibTex
    @PhdThesis{Scholz96,
      author   = {Sven-Bodo Scholz},
      title    = {Single Assignment C -- Entwurf Und Implementierung Einer Funktionalen C-variante Mit Spezieller Unterstützung Shape-invarianter Array-operationen},
      school   = {Institut für Informatik und Praktische Mathematik, Christian-Albrechts-Universität},
      year     = {1996},
      address  = {Kiel, Germany},
      note     = {Shaker Verlag, Aachen, 1997},
      category = {design,core,types,apps},
      summary  = {Sven-Bodo's PhD thesis is the most comprehensive document about SAC. The language design is described in detail. Moreover, a formal semantics of SAC is given. Further points of emphasis are the type system and the compilation process. Multigrid relaxation is used as a case study to illustrate programming with SAC.},
      topics   = {SAC,Language Manuals,Implementation of Arrays,Language Design},
      url      = {sac-design-sbs-phd-96.pdf},
    }
  • Integration Eines Modul- Und Klassen-konzeptes in Die Funktionale Programmiersprache Sac – Single Assignment C Clemens Grelck (1996). Kiel, Germany. sac2c-modules-classes-dipl-cg-96.pdf BibTex
    @MastersThesis{Grelck96,
      author   = {Clemens Grelck},
      title    = {Integration Eines Modul- Und Klassen-konzeptes in Die Funktionale Programmiersprache Sac -- Single Assignment C},
      school   = {Institut für Informatik und Praktische Mathematik, Christian-Albrechts-Universität},
      year     = {1996},
      address  = {Kiel, Germany},
      category = {design,core,modules},
      summary  = {Clemens' Diploma thesis is a detailed description of the module system, the integration of states and state modifications including the realization of powerful I/O facilities, and the foreign language interface which allows to use functions implemented in C within SAC programs. Both language design and implementation issues are covered.},
      topics   = {SAC,Uniqueness Types,Language Interfacing},
      url      = {sac2c-modules-classes-dipl-cg-96.pdf},
    }

1995

  • Classes and Objects As Basis for I/o in Sac Clemens Grelck, Sven-Bodo Scholz (1995). In 7th International Workshop on Implementation of Functional Languages (IFL'95), Båstad, Sweden. pp. 30–44. Chalmers University of Technology, Gothenburg, Sweden. sac-classes-objects-bastad-95.pdf BibTexAbstract
    @InProceedings{GrelSchoIFL95,
      author    = {Grelck, Clemens and Scholz, Sven-Bodo},
      title     = {Classes and Objects As Basis for I/o in Sac},
      booktitle = {7th International Workshop on Implementation of Functional Languages (IFL'95), B{\aa}stad, Sweden},
      year      = {1995},
      editor    = {Thomas Johnsson},
      pages     = {30--44},
      publisher = {Chalmers University of Technology, Gothenburg, Sweden},
      abstract  = {In imperative languages I/O is realized through sequences of side-effecting function applications/ procedure invocations.This seems to be a suitable way of specifying I/O since it coincides with an intuitive understanding of it as sequences of actions. In contrast, functional languages carefully have to avoid side-effects to sustain referential transparency. Many different solutions, such as dialogues, continuations, monads and uniqueness typing have been proposed. The I/O facilities of Sac are based on uniqueness typing.Instead of using an explicit type attribute as in Clean, unique types are introduced as special modules called classes. To provide a syntax as close as possible to that of imperative languages, we propose two new mechanisms to be included on top of classes in Sac: a call-by-reference mechanism and global objects. Although a combination of both mechanisms allows for very imperative-like notations, we can define a purely functional semantics. Thus we combine the advantages of referential transparency with the expressiveness of imperative I/O. Moreover, these two mechanisms allow the programmer to introduce and manipulate states of arbitrary data-structures within Sac.},
      category  = {design,core,states},
      summary   = {This paper describes the way states and state modifications are incorporated into the purely functional world of SAC based on uniqueness typing and an extension of the module system. The integration of I/O facilities is taken as a case study. The paper is the best English introduction to this part of SAC.},
      topics    = {SAC,Uniqueness Types},
      url       = {sac-classes-objects-bastad-95.pdf},
    }
    In imperative languages I/O is realized through sequences of side-effecting function applications/procedure invocations. This seems to be a suitable way of specifying I/O since it coincides with an intuitive understanding of it as sequences of actions. In contrast, functional languages carefully have to avoid side-effects to sustain referential transparency. Many different solutions, such as dialogues, continuations, monads and uniqueness typing have been proposed. The I/O facilities of Sac are based on uniqueness typing. Instead of using an explicit type attribute as in Clean, unique types are introduced as special modules called classes. To provide a syntax as close as possible to that of imperative languages, we propose two new mechanisms to be included on top of classes in Sac: a call-by-reference mechanism and global objects. Although a combination of both mechanisms allows for very imperative-like notations, we can define a purely functional semantics. Thus we combine the advantages of referential transparency with the expressiveness of imperative I/O. Moreover, these two mechanisms allow the programmer to introduce and manipulate states of arbitrary data-structures within Sac.
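    In present-day SaC this design surfaces as ordinary-looking I/O code. A minimal sketch using the standard StdIO module (the global state object never appears in the source):

      use StdIO: all;

      int main()
      {
        /* Reads like imperative I/O, but each call implicitly
           consumes and returns the hidden StdIO state object,
           so the program as a whole stays purely functional.  */
        printf( "%d\n", 6 * 7);
        return( 0);
      }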

1994

  • Single-assignment C — Functional Programming Using Imperative Style Sven-Bodo Scholz (1994). In 6th International Workshop on Implementation of Functional Languages (IFL'94), Norwich, England, UK. pp. 211–2113. University of East Anglia, Norwich, England, UK. sac-overview-norwich-94.pdf BibTex
    @InProceedings{ScholzIFL94,
      author    = {Sven-Bodo Scholz},
      title     = {Single-assignment C --- Functional Programming Using Imperative Style},
      booktitle = {6th International Workshop on Implementation of Functional Languages (IFL'94), Norwich, England, UK},
      year      = {1994},
      editor    = {John Glauert},
      pages     = {211--2113},
      publisher = {University of East Anglia, Norwich, England, UK},
      category  = {design,core,types},
      summary   = {This is the first paper to propose SAC. The basic language design ideas are explained and its syntax is defined. However, SAC has changed a lot since then. So, this classic paper can no longer be recommended as an introductory source of information about SAC.},
      topics    = {SAC,Implementation of Arrays,Language Design},
      url       = {sac-overview-norwich-94.pdf},
    }

2019

  • Tensor Comprehensions in SaC Sven-Bodo Scholz, Artjoms Šinkarovs (2019). In Proceedings of the 31st Symposium on the Implementation and Application of Functional Programming Languages. ACM. New York, NY, USA. DOItensor-comprehensions-ifl19.pdf BibTexAbstract
    @inproceedings{SchoShinIFL19,
     author = {Sven-Bodo Scholz and Artjoms Šinkarovs},
     title = {Tensor Comprehensions in {SaC}},
     year = {2019},
     isbn = {9781450375627},
     publisher = {ACM},
     address = {New York, NY, USA},
     booktitle = {Proceedings of the 31st Symposium on the Implementation and Application of Functional Programming Languages},
     series = {IFL '19},
     articleno = {15},
     numpages = {13},
     doi = {10.1145/3412932.3412947},
     url = {tensor-comprehensions-IFL19.pdf},
     summary = {Motivation,syntax, and semantics of Tensor Comprehensions in SaC.},
     abstract = {We propose a new notation for data parallel operators on multi-dimensional arrays named tensor comprehensions. This notation combines the basic principle of array-comprehensions with syntactical shortcuts very close to those found in the so-called Tensor Notations used in Physics and Mathematics. As a result, complex operators with rich semantics can be defined concisely. The key to this conciseness lies in the ability to define shape-polymorphic operations combined with the ability to infer array shapes from the immediate context. The paper provides a definition of the proposed notation, a formal shape inference process, as well as a set of re-write rules that translates tensor comprehensions as a zero-cost syntactic sugar into standard SaC expressions.},
     keywords = {Array comprehensions, Call by value, Array programming, Functional languages},
     category  = {core,design},
     location = {Singapore}
    }
    We propose a new notation for data parallel operators on multi-dimensional arrays named tensor comprehensions. This notation combines the basic principle of array-comprehensions with syntactical shortcuts very close to those found in the so-called Tensor Notations used in Physics and Mathematics. As a result, complex operators with rich semantics can be defined concisely. The key to this conciseness lies in the ability to define shape-polymorphic operations combined with the ability to infer array shapes from the immediate context. The paper provides a definition of the proposed notation, a formal shape inference process, as well as a set of re-write rules that translates tensor comprehensions as a zero-cost syntactic sugar into standard SaC expressions.
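    As a flavour of the notation (adapted from the paper; the exact surface syntax may differ between compiler versions), matrix product can be written as a nested tensor comprehension in which the ranges of i, j and k as well as the result shape are inferred from the uses of a and b:

      use Array: all;

      /* Matrix product as a tensor comprehension; sum reduces
         over the inner comprehension.                          */
      double[.,.] matmul( double[.,.] a, double[.,.] b)
      {
        res = { [i,j] -> sum( { [k] -> a[i,k] * b[k,j] }) };
        return( res);
      }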

2012

  • Single Assignment C (sac): High Productivity Meets High Performance C. Grelck (2012). In 4th Central European Functional Programming Summer School (CEFP'11), Budapest, Hungary. pp. 207–278. Springer. DOI2012_1.pdf BibTexAbstract
    @InProceedings{GrelckCEFP11,
      author    = {C. Grelck},
      title     = {Single Assignment C (sac): High Productivity Meets High Performance},
      booktitle = {4th Central European Functional Programming Summer School (CEFP'11), Budapest, Hungary},
      year      = {2012},
      editor    = {V. Zs\'ok and Z. Horv\'ath and R. Plasmeijer},
      volume    = {7241},
      series    = {Lecture Notes in Computer Science},
      pages     = {207--278},
      publisher = {Springer},
      abstract  = {We present the ins and outs of the purely functional, data parallel programming language SaC (Single Assignment C). SaC defines state- and side-effect-free semantics on top of a syntax resembling that of imperative languages like C/C++/C# or Java: functional programming with curly brackets. In contrast to other functional languages data aggregation in SaC is not based on lists and trees, but puts stateless arrays into the focus. SaC implements an abstract calculus of truly multidimensional arrays that is adopted from interpreted array languages like Apl. Arrays are abstract values with certain structural properties. They are treated in a holistic way, not as loose collections of data cells or indexed memory address ranges. Programs can and should be written in a mostly index-free style. Functions consume array values as arguments and produce array values as results. The array type system of SaC allows such functions to abstract not only from the size of vectors or matrices but likewise from the number of array dimensions, supporting a highly generic programming style. The design of SaC aims at reconciling high productivity in software engineering of compute-intensive applications with high performance in program execution on modern multi- and many-core computing systems. While SaC competes with other functional and declarative languages on the productivity aspect, it competes with hand-parallelised C and Fortran code on the performance aspect. We achieve our goal through stringent co-design of programming language and compilation technology.  The focus on arrays in general and the abstract view of arrays in particular combined with a functional state-free semantics are key ingredients in the design of SaC. In conjunction they allow for far-reaching program transformations and fully compiler-directed parallelisation. From literally the same source code SaC currently supports symmetric multi-socket, multi-core, hyperthreaded server systems, CUDA-enables graphics accelerators and the MicroGrid, an innovative general-purpose many-core architecture. The CEFP lecture provides an introduction into the language design of SaC, followed by an illustration of how these concepts can be harnessed to write highly abstract, reusable and elegant code. We conclude with outlining the major compiler technologies for achieving runtime performance levels that are competitive with low-level machine-oriented programming environments.},
      category  = {design,core},
      doi       = {10.1007/978-3-642-32096-5_5},
      topics    = {SAC},
      url       = {2012_1.pdf},
    }
    We present the ins and outs of the purely functional, data parallel programming language SaC (Single Assignment C). SaC defines state- and side-effect-free semantics on top of a syntax resembling that of imperative languages like C/C++/C# or Java: functional programming with curly brackets. In contrast to other functional languages, data aggregation in SaC is not based on lists and trees, but puts stateless arrays into the focus. SaC implements an abstract calculus of truly multidimensional arrays that is adopted from interpreted array languages like Apl. Arrays are abstract values with certain structural properties. They are treated in a holistic way, not as loose collections of data cells or indexed memory address ranges. Programs can and should be written in a mostly index-free style. Functions consume array values as arguments and produce array values as results. The array type system of SaC allows such functions to abstract not only from the size of vectors or matrices but likewise from the number of array dimensions, supporting a highly generic programming style. The design of SaC aims at reconciling high productivity in software engineering of compute-intensive applications with high performance in program execution on modern multi- and many-core computing systems. While SaC competes with other functional and declarative languages on the productivity aspect, it competes with hand-parallelised C and Fortran code on the performance aspect. We achieve our goal through stringent co-design of programming language and compilation technology. The focus on arrays in general and the abstract view of arrays in particular combined with a functional state-free semantics are key ingredients in the design of SaC. In conjunction they allow for far-reaching program transformations and fully compiler-directed parallelisation. From literally the same source code SaC currently supports symmetric multi-socket, multi-core, hyperthreaded server systems, CUDA-enabled graphics accelerators and the MicroGrid, an innovative general-purpose many-core architecture. The CEFP lecture provides an introduction into the language design of SaC, followed by an illustration of how these concepts can be harnessed to write highly abstract, reusable and elegant code. We conclude with outlining the major compiler technologies for achieving runtime performance levels that are competitive with low-level machine-oriented programming environments.
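    A small sketch of the programming style described in these lecture notes (illustrative function name; whole-array arithmetic assumed from the standard Array module): the array type system distinguishes, e.g., double[3] (a vector of three elements), double[.,.] (a matrix of any shape) and double[*] (an array of any rank), so one index-free definition covers all shapes:

      use Array: all;

      /* One definition for all ranks and shapes: the standard
         library overloads arithmetic on whole arrays.          */
      double[*] axpy( double alpha, double[*] x, double[*] y)
      {
        return( alpha * x + y);
      }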

2011

  • Nested Arrays in Single Assignment C R. Douma (2011). Amsterdam, Netherlands. 2011_0.pdf BibTexAbstract
    @MastersThesis{Douma11,
      author    = {R. Douma},
      title     = {Nested Arrays in Single Assignment C},
      school    = {University of Amsterdam},
      year      = {2011},
      address   = {Amsterdam, Netherlands},
      topics    = {SAC},
      abstract  = {In many languages when one talks about arrays this means rectangular arrays. However, there are many problems that are not rectangular and it is often nonlogical to describe them using rectangular arrays. We will call these non rectangular arrays: irregular arrays. In this thesis we introduce an implementation for irregular arrays, called nested arrays, in Single Assignment C (or SaC for short). We discuss the design space of irregular arrays and extend the SaC language, and the SaC compiler to support irregular arrays. As a result we show that for SaC programs that have irregular data the use of nested arrays can lead to a significant reduction in memory requirements, while at the same time providing substantial speedup.},
      category  = {core, design},
      url       = {2011_0.pdf},
    }
    In many languages, when one talks about arrays this means rectangular arrays. However, there are many problems that are not rectangular, and it is often unnatural to describe them using rectangular arrays. We call these non-rectangular arrays irregular arrays. In this thesis we introduce an implementation for irregular arrays, called nested arrays, in Single Assignment C (or SaC for short). We discuss the design space of irregular arrays and extend the SaC language and the SaC compiler to support irregular arrays. As a result we show that for SaC programs that operate on irregular data, the use of nested arrays can lead to a significant reduction in memory requirements, while at the same time providing substantial speedup.
  • Harnessing the Power of Gpus without Losing Abstractions in Sac and Arrayol: A Comparative Study Jing Guo, Antonio Wendell O Rodrigues, Jeyarajan Thiyagalingam, Frederic Guyomarc'h, Pierre Boulet, Sven-Bodo Scholz (2011). In 16th International Workshop on High-Level Parallel Programming Models and Supportive Environments (HIPS'11), Anchorage, Alaska, USA. IEEE Xplore. DOI2011_4.pdf BibTexAbstract
    @InProceedings{GuoWendJeyaetalHIPS11,
      author    = {Jing Guo and Antonio Wendell O Rodrigues and Jeyarajan Thiyagalingam and Frederic Guyomarc'h and Pierre Boulet and Sven-Bodo Scholz},
      title     = {Harnessing the Power of Gpus without Losing Abstractions in Sac and Arrayol: A Comparative Study},
      booktitle = {16th International Workshop on High-Level Parallel Programming Models and Supportive Environments (HIPS'11), Anchorage, Alaska, USA},
      year      = {2011},
      editor    = {Thorsten Hoefler},
      publisher = {IEEE Xplore},
      abstract  = {Over recent years, using Graphics Processing Units (GPUs) has become as an effective method for increasing the performance of many applications. However, these performance benefits from GPUs come at a price. Firstly extensive programming expertise and intimate knowledge of the underlying hardware are essential for gaining good speedups. Secondly, the expressibility of GPU-based programs are not powerful enough to retain the high-level abstractions of the solutions. Although the programming experience has been significantly improved by existing frameworks like CUDA and OpenCL, it is still a challenge to effectively utilise these devices while still retaining the programming abstractions. To this end, performing a source-to-source transformation, whereby a high-level language is mapped to CUDA or OpenCL, is an attractive option. In particular, it enables one to retain high-level abstractions and to harness the power of GPUs without any expertise on the GPGPU programming. In this paper, we compare and analyse two such schemes. One of them is a transformation mechanism for mapping a image/signal processing domain-specific language, ArrayOL, to OpenCL. The other one is a transformation route for mapping a high-level general purpose array processing language, Single Assignment C (SaC) to CUDA. Using a real-world image processing application as a running example, we demonstrate that albeit the fact of being general purpose, the array processing language be used to specify complex array access patterns generically. Performance of the generated CUDA code is comparable to the OpenCL code created from domain-specific language.},
      affil     = {ctca},
      category  = {apps,design},
      doi       = {10.1109/IPDPS.2011.276},
      topics    = {SAC},
      url       = {2011_4.pdf},
    }
    Over recent years, using Graphics Processing Units (GPUs) has become an effective method for increasing the performance of many applications. However, these performance benefits from GPUs come at a price. Firstly, extensive programming expertise and intimate knowledge of the underlying hardware are essential for gaining good speedups. Secondly, GPU-based programming models are not expressive enough to retain the high-level abstractions of the solutions. Although the programming experience has been significantly improved by existing frameworks like CUDA and OpenCL, it is still a challenge to effectively utilise these devices while retaining the programming abstractions. To this end, performing a source-to-source transformation, whereby a high-level language is mapped to CUDA or OpenCL, is an attractive option. In particular, it enables one to retain high-level abstractions and to harness the power of GPUs without any expertise in GPGPU programming. In this paper, we compare and analyse two such schemes. One of them is a transformation mechanism for mapping an image/signal processing domain-specific language, ArrayOL, to OpenCL. The other one is a transformation route for mapping a high-level general-purpose array processing language, Single Assignment C (SaC), to CUDA. Using a real-world image processing application as a running example, we demonstrate that, despite being general purpose, the array processing language can be used to specify complex array access patterns generically. Performance of the generated CUDA code is comparable to that of the OpenCL code created from the domain-specific language.

2009

  • Compiling the Functional Data-parallel Language Sac for Microgrids of Self-adaptive Virtual Processors Clemens Grelck, Stephan Herhut, Chris Jesshope, Carl Joslin, Mike Lankamp, Sven-Bodo Scholz, Alex Shafarenko (2009). In 14th Workshop on Compilers for Parallel Computing (CPC'09), IBM Research Center, Zürich, Switzerland. 2009_4.pdf BibTexAbstract
    @InProceedings{GrelJessJosletalCPC09,
      author    = {Clemens Grelck and Stephan Herhut and Chris Jesshope and Carl Joslin and Mike Lankamp and Sven-Bodo Scholz and Alex Shafarenko},
      title     = {Compiling the Functional Data-parallel Language Sac for Microgrids of Self-adaptive Virtual Processors},
      booktitle = {14th Workshop on Compilers for Parallel Computing (CPC'09), IBM Research Center, Z\"urich, Switzerland},
      year      = {2009},
      abstract  = {We present preliminary results from compiling the high- level, functional and data-parallel programming language SaC into a novel multi-core design: Microgrids of Self- Adaptive Virtual Processors (SVPs). The side-effect free nature of SaC in conjunction with its data-parallel foundation make it an ideal candidate for auto-parallelisation. Notwithstanding these favourable pre-conditions, scheduling and data-placement pose major challenges for effective parallelisation of irregular applications because fine-grained dynamic approaches in inflict large overheads on conventional architectures. The Microgrid architecture promises a radical shift: thread creation and context switches are implemented in hardware and cause negligible overhead. Likewise, scheduling of tasks to computing resources is catered for by hardware. This paper investigates aspects of the Microgrid architecture which may influence the overall performance of compiled data-parallel programs. In particular, we look at alternative thread creation schemes for n - dimensional, data-parallel operations and their effect on overall performance. Furthermore, we investigate the architecture's capability to cope with workload imbalances within such operations. The paper presents a sequence of experiments on a cycle-accurate emulator of the Microgrid architecture from which we derive some guiding principles for an effective compilation of data parallel operations. Based on these principles, we present a compilation scheme for the data-parallel core of SaC.},
      affil     = {ctca},
      category  = {par,design,core},
      topics    = {SAC},
      url       = {2009_4.pdf},
    }
    We present preliminary results from compiling the high-level, functional and data-parallel programming language SaC into a novel multi-core design: Microgrids of Self-Adaptive Virtual Processors (SVPs). The side-effect free nature of SaC in conjunction with its data-parallel foundation make it an ideal candidate for auto-parallelisation. Notwithstanding these favourable pre-conditions, scheduling and data-placement pose major challenges for effective parallelisation of irregular applications because fine-grained dynamic approaches inflict large overheads on conventional architectures. The Microgrid architecture promises a radical shift: thread creation and context switches are implemented in hardware and cause negligible overhead. Likewise, scheduling of tasks to computing resources is catered for by hardware. This paper investigates aspects of the Microgrid architecture which may influence the overall performance of compiled data-parallel programs. In particular, we look at alternative thread creation schemes for n-dimensional, data-parallel operations and their effect on overall performance. Furthermore, we investigate the architecture's capability to cope with workload imbalances within such operations. The paper presents a sequence of experiments on a cycle-accurate emulator of the Microgrid architecture from which we derive some guiding principles for an effective compilation of data-parallel operations. Based on these principles, we present a compilation scheme for the data-parallel core of SaC.

2006

  • Generic Programming on the Structure of Homogeneously Nested Arrays Stephan Herhut, Sven-Bodo Scholz (2006). In 7th Symposium on Trends in Functional Programming. pp. 351–366. University of Nottingham. gpotsohna.pdf BibTexAbstract
    @InProceedings{HerhSchoTFP06,
      author    = {Stephan Herhut and Sven-Bodo Scholz},
      title     = {Generic Programming on the Structure of Homogeneously Nested Arrays},
      booktitle = {7th Symposium on Trends in Functional Programming},
      year      = {2006},
      editor    = {Henrik Nilsson},
      pages     = {351--366},
      publisher = {University of Nottingham},
      abstract  = {In this paper we propose a new means to model and operate on nested arrays that allows for a high level of abstraction without introducing a performance penalty. We achieve this by using a nesting structure on array types which allows us to shift the nesting information of arrays from the runtime representation level to the type system level. This information can then be exploited for generic function definitions on the nesting structure of arrays which, as we show, neatly integrates with subtyping based function overloading. Finally, we demonstrate for an example how nested arrays and generic function definitions can be fully stripped out using existing optimisation techniques.},
      affil     = {ctca},
      category  = {types, design},
      topics    = {SAC},
      url       = {gpotsohna.pdf},
    }
    In this paper we propose a new means to model and operate on nested arrays that allows for a high level of abstraction without introducing a performance penalty. We achieve this by using a nesting structure on array types which allows us to shift the nesting information of arrays from the runtime representation level to the type system level. This information can then be exploited for generic function definitions on the nesting structure of arrays which, as we show, neatly integrates with subtyping based function overloading. Finally, we demonstrate for an example how nested arrays and generic function definitions can be fully stripped out using existing optimisation techniques.
  • SaC: A Functional Array Language for Efficient Multithreaded Execution Clemens Grelck, Sven-Bodo Scholz (2006). International Journal of Parallel Programming 34 (4) pp. 383–427. DOI safalfeme.pdf BibTexAbstract
    @Article{GrelSchoIJPP06,
      author    = {Clemens Grelck and Sven-Bodo Scholz},
      title     = {Sac: A Functional Array Language for Efficient Multithreaded Execution},
      journal   = {International Journal of Parallel Programming},
      year      = {2006},
      volume    = {34},
      number    = {4},
      pages     = {383--427},
      abstract  = {We give an in-depth introduction to the design of our functional array programming language SaC, the main aspects of its compilation into host machine code, and its parallelisation based on multi-threading. The language design of SaC aims at combining high-level, compositional array programming with fully automatic resource management for highly productive code development and maintenance. We outline the compilation process that maps SaC programs to computing machinery. Here, our focus is on optimisation techniques that aim at restructuring entire applications from nested compositions of general fine-grained operations into specialised coarse-grained operations. We present our implicit parallelisation technology for shared memory architectures based on multi-threading and discuss further optimisation opportunities on this level of code generation. Both optimisation and parallelisation rigorously exploit the absence of side-effects and the explicit data flow characteristic of a functional setting.},
      affil     = {ctca},
      category  = {core, design},
      contents  = {[ISSN: 0885-7458 (Paper) 1573-7640 (Online)]},
      doi       = {10.1007/s10766-006-0018-x},
      sourceurl = {http://dx.doi.org/10.1007/s10766-006-0018-x},
      topics    = {SAC},
      url       = {safalfeme.pdf},
    }
    We give an in-depth introduction to the design of our functional array programming language SaC, the main aspects of its compilation into host machine code, and its parallelisation based on multi-threading. The language design of SaC aims at combining high-level, compositional array programming with fully automatic resource management for highly productive code development and maintenance. We outline the compilation process that maps SaC programs to computing machinery. Here, our focus is on optimisation techniques that aim at restructuring entire applications from nested compositions of general fine-grained operations into specialised coarse-grained operations. We present our implicit parallelisation technology for shared memory architectures based on multi-threading and discuss further optimisation opportunities on this level of code generation. Both optimisation and parallelisation rigorously exploit the absence of side-effects and the explicit data flow characteristic of a functional setting.
  • Functional Array Programming in SaC Sven-Bodo Scholz (2006). In Central European Summer School on Functional Programming. Springer. DOI fapis.pdf BibTexAbstract
    @InCollection{ScholzCEFP05,
      author     = {Sven-Bodo Scholz},
      title      = {Functional Array Programming in Sac},
      booktitle  = {Central European Summer School on Functional Programming},
      publisher  = {Springer},
      year       = {2006},
      editor     = {Zoltan Horvath},
      volume     = {4164},
      series     = {LNCS},
      abstract   = {These notes present an introduction into array-based programming from a functional, i.e., side-effect-free perspective. The first part focuses on promoting arrays as predominant, stateless data structure. This leads to a programming style that favors compositions of generic array operations that manipulate entire arrays over specifications that are made in an element-wise fashion. An algebraically consistent set of such operations is defined and several examples are given demonstrating the expressiveness of the proposed set of operations. The second part shows how such a set of array operations can be defined within the first-order functional array language SaC. It does not only discuss the language design issues involved but it also tackles implementation issues that are crucial for achieving acceptable runtimes from such generically specified array operations.},
      affil      = {ctca},
      category   = {design},
      doi        = {10.1007/11894100_3},
      pubaddress = {Berlin, Heidelberg, Germany},
      topics     = {SAC,on SAC},
      url        = {fapis.pdf},
    }
    These notes present an introduction into array-based programming from a functional, i.e., side-effect-free perspective. The first part focuses on promoting arrays as predominant, stateless data structure. This leads to a programming style that favors compositions of generic array operations that manipulate entire arrays over specifications that are made in an element-wise fashion. An algebraically consistent set of such operations is defined and several examples are given demonstrating the expressiveness of the proposed set of operations. The second part shows how such a set of array operations can be defined within the first-order functional array language SaC. It does not only discuss the language design issues involved but it also tackles implementation issues that are crucial for achieving acceptable runtimes from such generically specified array operations.
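    As a flavour of the compositional, shape-invariant style the notes promote, here is a minimal sketch in current sac2c syntax (the function itself is an illustrative assumption, not code from the notes):

      use Array: all;
      use Math: all;

      /* Shape-invariant: accepts vectors, matrices, tensors of any rank. */
      double l2norm(double[*] a)
      {
        return sqrt(sum(a * a));
      }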

2005

  • A Hybrid Shared Memory Execution Model for a Data Parallel Language with I/O C. Grelck, S. Kuthe, S.-B. Scholz (2005). In 3rd Workshop on High-Level Parallel Programming and Applications (HLPP'05), Warwick, UK. pp. 147–160. University of Warwick, UK. DOI ahsmemfadpl.pdf BibTexAbstract
    @InProceedings{GrelKuthSchoHLPP05,
      author    = {C. Grelck and S. Kuthe and S.-B. Scholz},
      title     = {A Hybrid Shared Memory Execution Model for a Data Parallel Language with I/o},
      booktitle = {3rd Workshop on High-Level Parallel Programming and Applications (HLPP'05), Warwick, UK},
      year      = {2005},
      editor    = {A. Tiskin},
      pages     = {147--160},
      publisher = {University of Warwick, UK},
      abstract  = {We propose a novel execution model for the implicitly parallel execution of data parallel programs in the presence of general I/O operations. This model is called hybrid because it combines the advantages of the standard execution models fork/join and SPMD. Based on program analysis the hybrid model adapts itself to one or the other on the granularity of individual instructions. We outline compilation techniques that systematically derive the organization of parallel code from data flow characteristics aiming at the reduction of execution mode switches in general and synchronization/communication requirements in particular. Experiments based on a prototype implementation show the effectiveness of the hybrid execution model for reducing parallel overhead.},
      affil     = {ctca},
      category  = {design, opt},
      doi       = {10.1142/S012962640800320X},
      topics    = {SAC},
      url       = {ahsmemfadpl.pdf},
    }
    We propose a novel execution model for the implicitly parallel execution of data parallel programs in the presence of general I/O operations. This model is called hybrid because it combines the advantages of the standard execution models fork/join and SPMD. Based on program analysis the hybrid model adapts itself to one or the other on the granularity of individual instructions. We outline compilation techniques that systematically derive the organization of parallel code from data flow characteristics aiming at the reduction of execution mode switches in general and synchronization/communication requirements in particular. Experiments based on a prototype implementation show the effectiveness of the hybrid execution model for reducing parallel overhead.
  • Shared Memory Multiprocessor Support for Functional Array Processing in SaC Clemens Grelck (2005). Journal of Functional Programming 15 (3) pp. 353–401. DOI smmsffapis.pdf BibTexAbstract
    @Article{GrelckJFP05,
      author   = {Clemens Grelck},
      title    = {Shared Memory Multiprocessor Support for Functional Array Processing in Sac},
      journal  = {Journal of Functional Programming},
      year     = {2005},
      volume   = {15},
      number   = {3},
      pages    = {353--401},
      abstract = {Classical application domains of parallel computing are dominated by processing large arrays of numerical data. Whereas most functional languages focus on lists and trees rather than on arrays, SAC is tailor-made in design and in implementation for efficient high-level array processing. Advanced compiler optimizations yield performance levels that are often competitive with low-level imperative implementations. Based on SAC, we develop compilation techniques and runtime system support for the compiler-directed parallel execution of high-level functional array processing code on shared memory architectures. Competitive sequential performance gives us the opportunity to exploit the conceptual advantages of the functional paradigm for achieving real performance gains with respect to existing imperative implementations, not only in comparison with uniprocessor runtimes. While the design of SAC facilitates parallelization, the particular challenge of high sequential performance is that realization of satisfying speedups through parallelization becomes substantially more difficult. We present an initial compilation scheme and multi-threaded execution model, which we step-wise refine to reduce organizational overhead and to improve parallel performance. We close with a detailed analysis of the impact of certain design decisions on runtime performance, based on a series of experiments.},
      category = {design, par},
      contents = {Canonical reference for SAC multithreading},
      doi      = {10.1017/S0956796805005538},
      topics   = {SAC},
      url      = {smmsffapis.pdf},
    }
    Classical application domains of parallel computing are dominated by processing large arrays of numerical data. Whereas most functional languages focus on lists and trees rather than on arrays, SAC is tailor-made in design and in implementation for efficient high-level array processing. Advanced compiler optimizations yield performance levels that are often competitive with low-level imperative implementations. Based on SAC, we develop compilation techniques and runtime system support for the compiler-directed parallel execution of high-level functional array processing code on shared memory architectures. Competitive sequential performance gives us the opportunity to exploit the conceptual advantages of the functional paradigm for achieving real performance gains with respect to existing imperative implementations, not only in comparison with uniprocessor runtimes. While the design of SAC facilitates parallelization, the particular challenge of high sequential performance is that realization of satisfying speedups through parallelization becomes substantially more difficult. We present an initial compilation scheme and multi-threaded execution model, which we step-wise refine to reduce organizational overhead and to improve parallel performance. We close with a detailed analysis of the impact of certain design decisions on runtime performance, based on a series of experiments.
  • Generic Array Programming in SaC Clemens Grelck, Sven-Bodo Scholz (2005). In 21. Workshop der GI-Fachgruppe 2.1.4 Programmiersprachen und Rechenkonzepte, Bad Honnef, Germany. pp. 43–53. University of Kiel, Institute of Computer Science and Applied Mathematics. gapis.pdf BibTexAbstract
    @InProceedings{GrelSchoGIPR04,
      author    = {Clemens Grelck and Sven-Bodo Scholz},
      title     = {Generic Array Programming in Sac},
      booktitle = {21.~Workshop der GI-Fachgruppe 2.1.4 Programmiersprachen und Rechenkonzepte, Bad Honnef, Germany},
      year      = {2005},
      editor    = {Wolfgang Goerigk},
      volume    = {0410},
      series    = {Technischer Bericht},
      pages     = {43--53},
      publisher = {University of Kiel, Institute of Computer Science and Applied Mathematics},
      abstract  = {SaC is a purely functional array processing language for computationally intensive numerical applications. Its design aims at combining efficiency in program construction with efficiency in parallel program execution. We demonstrate the declarative, generic programming style of SaC by means of a small case study: 3-dimensional complex fast-Fourier transforms. The impact of abstraction on expressiveness, readability, and maintainability of code as well as on clarity of underlying mathematical concepts is discussed and compared with other approaches. We quantify the associated impact on runtime performance both in uniprocessor and in multiprocessor environments.},
      affil     = {ctca},
      category  = { design},
      topics    = {SAC},
      url       = {GAPIS.pdf},
    }
    SaC is a purely functional array processing language for computationally intensive numerical applications. Its design aims at combining efficiency in program construction with efficiency in parallel program execution. We demonstrate the declarative, generic programming style of SaC by means of a small case study: 3-dimensional complex fast-Fourier transforms. The impact of abstraction on expressiveness, readability, and maintainability of code as well as on clarity of underlying mathematical concepts is discussed and compared with other approaches. We quantify the associated impact on runtime performance both in uniprocessor and in multiprocessor environments.

2004

  • Implicit Memory Management for SaC Clemens Grelck, Kai Trojahner (2004). In Implementation and Application of Functional Languages, 16th International Workshop, IFL'04. pp. 335–348. University of Kiel, Institute of Computer Science and Applied Mathematics. greltrojifl04.pdf BibTexAbstract
    @InProceedings{GrelTrojIFL04,
      author    = {Clemens Grelck and Kai Trojahner},
      title     = {Implicit Memory Management for Sac},
      booktitle = {Implementation and Application of Functional Languages, 16th International Workshop, IFL'04},
      year      = {2004},
      editor    = {Clemens Grelck and Frank Huch},
      pages     = {335--348},
      publisher = {University of Kiel, Institute of Computer Science and Applied Mathematics},
      note      = {Technical Report 0408},
      abstract  = {While almost all functional languages rely on garbage collection for implicit memory management, the needs of efficient array processing are better suited by reference counting. The opportunities to reclaim unused memory immediately and to implement functionally sound array operations by destructive in-place updates turn out to be essential for approaching the performance achieved by imperative languages. In this paper we outline the realization of implicit memory management in the functional array language SaC. Starting with basic schemes for the introduction of memory management and reference counting instructions into SaC code, the emphasis is laid on a range of optimizations that aim at reducing runtime overhead and exploiting memory reuse opportunities.},
      category  = {design, opt},
      topics    = {SAC},
      url       = {greltrojifl04.pdf},
    }
    While almost all functional languages rely on garbage collection for implicit memory management, the needs of efficient array processing are better suited by reference counting. The opportunities to reclaim unused memory immediately and to implement functionally sound array operations by destructive in-place updates turn out to be essential for approaching the performance achieved by imperative languages. In this paper we outline the realization of implicit memory management in the functional array language SaC. Starting with basic schemes for the introduction of memory management and reference counting instructions into SaC code, the emphasis is laid on a range of optimizations that aim at reducing runtime overhead and exploiting memory reuse opportunities.
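    A hedged sketch of the reuse opportunity described above (illustrative function; the destructive update is decided by the compiler and runtime system, not written by the programmer):

      use Array: all;

      double[.] scale_first(double[.] a)
      {
        /* If the reference count of a has dropped to 1 here, modarray
           can update a's memory in place instead of copying the array. */
        return modarray(a, [0], 2.0 * a[[0]]);
      }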
  • Towards Fully Controlled Overloading across Module Boundaries S. Herhut, S.-B. Scholz (2004). In 16th International Workshop on the Implementation and Application of Functional Languages (IFL'04), Lübeck, Germany. pp. 395–408. University of Kiel. DOI tfcoamb.pdf BibTexAbstract
    @InProceedings{HerhSchoIFL04,
      author    = {S. Herhut and S.-B. Scholz},
      title     = {Towards Fully Controlled Overloading across Module Boundaries},
      booktitle = {16th International Workshop on the Implementation and Application of Functional Languages (IFL'04), L\"ubeck, Germany},
      year      = {2004},
      editor    = {C. Grelck and F. Huch},
      pages     = {395--408},
      publisher = {University of Kiel},
      abstract  = {This paper proposes a set of modularisation constructs as well as a new implementation technique for overloading functions across module boundaries. In contrast to existing approaches, it allows to fully preserve separation of namespaces without restricting the potential for later overloadings, including mutual recursion across module boundaries. This gives the programmer control over the visibility of individual function definitions within each namespace and, thus, allows several different sets of instances for a single overloaded function to be used within one program. Based on a very simple applied λ-calculus as core language, the modularisation constructs are defined and a transformation scheme into an applied λ-calculus is presented. Furthermore, an outline of an implementation in the context of the functional programming language SaC is given.},
      affil     = {ctca},
      category  = {design, modules},
      doi       = {10.1.1.408.4298},
      topics    = {SAC,on SAC},
      url       = {TFCOAMB.pdf},
    }
    This paper proposes a set of modularisation constructs as well as a new implementation technique for overloading functions across module boundaries. In contrast to existing approaches, it allows to fully preserve separation of namespaces without restricting the potential for later overloadings, including mutual recursion across module boundaries. This gives the programmer control over the visibility of individual function definitions within each namespace and, thus, allows several different sets of instances for a single overloaded function to be used within one program. Based on a very simple applied λ-calculus as core language, the modularisation constructs are defined and a transformation scheme into an applied λ-calculus is presented. Furthermore, an outline of an implementation in the context of the functional programming language SaC is given.
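    The intended use can be sketched roughly as follows (module and function names are made up, and the syntax is simplified to current SaC module conventions): a client imports the existing instances of a function and contributes further ones without the namespaces collapsing.

      /* file Geometry.sac */
      module Geometry;
      export {area};

      double area(double[2] rect) { return rect[[0]] * rect[[1]]; }

      /* file Circles.sac */
      module Circles;
      import Geometry: {area};
      export {area};

      double area(double radius) { return 3.141593 * radius * radius; }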
  • With-loop Scalarization: Merging Nested Array Operations Clemens Grelck, Sven-Bodo Scholz, Kai Trojahner (2004). In Implementation of Functional Languages, 15th International Workshop (IFL'03), Edinburgh, Scotland, UK, Revised Selected Papers. Springer. DOI wlsmnao.pdf BibTexAbstract
    @InProceedings{GrelSchoTrojIFL03,
      author     = {Clemens Grelck and Sven-Bodo Scholz and Kai Trojahner},
      title      = {With-loop Scalarization: Merging Nested Array Operations},
      booktitle  = {Implementation of Functional Languages, 15th International Workshop (IFL'03), Edinburgh, Scotland, UK, Revised Selected Papers},
      year       = {2004},
      editor     = {Phil Trinder and Greg Michaelson},
      volume     = {3145},
      series     = {Lecture Notes in Computer Science},
      publisher  = {Springer},
      abstract   = {Construction of complex array operations by composition of more basic ones allows for abstract and concise specifications of algorithms. Unfortunately, naïve compilation of such specifications leads to creation of many temporary arrays at runtime and, consequently, to poor performance characteristics. This paper elaborates on a new compiler optimization, named with-loop scalarization, which aims at eliminating temporary arrays in the context of nested array operations. It is based on with-loops, a versatile array comprehension construct used by the functional array language SaC both for specification as well as for internal representation of array operations. The impact of with-loop scalarization on the runtime performance of compiled SaC code is demonstrated by several experiments involving support for arithmetic on arrays of complex numbers and the application kernel FT from the NAS benchmark suite.},
      category   = {design, opt},
      doi        = {10.1007/978-3-540-27861-0_8},
      pubaddress = {Berlin, Heidelberg, Germany},
      topics     = {SAC,Avoiding Temporaries},
      url        = {WLSMNAO.pdf},
    }
    Construction of complex array operations by composition of more basic ones allows for abstract and concise specifications of algorithms. Unfortunately, naïve compilation of such specifications leads to creation of many temporary arrays at runtime and, consequently, to poor performance characteristics. This paper elaborates on a new compiler optimization, named with-loop scalarization, which aims at eliminating temporary arrays in the context of nested array operations. It is based on with-loops, a versatile array comprehension construct used by the functional array language SaC both for specification as well as for internal representation of array operations. The impact of with-loop scalarization on the runtime performance of compiled SaC code is demonstrated by several experiments involving support for arithmetic on arrays of complex numbers and the application kernel FT from the NAS benchmark suite.
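    The effect can be pictured on a small hypothetical example in current sac2c syntax: a with-loop whose body yields a (small) array is rewritten into a single with-loop over the combined index space, so the per-element temporaries disappear.

      use Array: all;

      double[.,.] zip(double[.] re, double[.] im)
      {
        n = shape(re)[[0]];
        /* Before scalarization: every iteration builds a temporary
           2-element vector.                                         */
        return with {
                 ( [0] <= iv < [n] ) : [ re[iv], im[iv] ];
               } : genarray( [n], [0.0, 0.0] );
        /* After with-loop scalarization the compiler iterates over the
           combined index space [n,2] with a scalar body:
               with {
                 ( [0,0] <= [i,j] < [n,2] ) : j == 0 ? re[[i]] : im[[i]];
               } : genarray( [n,2], 0.0 );                              */
      }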

2003

  • A Compiler Backend for Generic Programming with Arrays Dietmar Kreye (2003). acbfgpwa.pdf BibTex
    @PhdThesis{Kreye03,
      author = {Dietmar Kreye},
      title  = {A Compiler Backend for Generic Programming with Arrays},
      school = {Institute of Computer Science and Applied Mathematics, University of Kiel, Germany},
      year   = {2003},
      note   = {Shaker Verlag, Aachen, 2003},
      topics = {SAC},
      summary    = {},
      abstract   = {},
      category   = {design},
      doi        = {},
      url        = {ACBFGPWA.pdf} 
    }
  • SaC — from High-level Programming with Arrays to Efficient Parallel Execution Clemens Grelck, Sven-Bodo Scholz (2003). Parallel Processing Letters 13 (3) pp. 401–412. DOI sfhlpwatepe.pdf BibTexAbstract
    @Article{GrelSchoPPL03,
      author   = {Clemens Grelck and Sven-Bodo Scholz},
      title    = {Sac --- from High-level Programming with Arrays to Efficient Parallel Execution},
      journal  = {Parallel Processing Letters},
      year     = {2003},
      volume   = {13},
      number   = {3},
      pages    = {401--412},
      abstract = {SAC is a purely functional array processing language designed with numerical applications in mind. It supports generic, high-level program specifications in the style of APL. However, rather than providing a fixed set of built-in array operations, SAC provides means to specify such operations in the language itself in a way that still allows their application to arrays of any rank and size. This paper illustrates the major steps in compiling generic, rank- and shape-invariant SAC specification into efficiently executable multithreaded code for parallel execution on shared memory multiprocessors. The effectiveness of the compilation techniques is demonstrated by means of a small case study on the PDE1 benchmark, which implements 3-dimensional red/black successive over-relaxation. Comparisons with HPF and ZPL show that despite the genericity of code, SAC achieves highly competitive runtime performance characteristics.},
      category = {core,design},
      doi      = {10.1142/S0129626403001379},
      topics   = {SAC},
      url      = {SFHLPWATEPE.pdf},
    }
    SAC is a purely functional array processing language designed with numerical applications in mind. It supports generic, high-level program specifications in the style of APL. However, rather than providing a fixed set of built-in array operations, SAC provides means to specify such operations in the language itself in a way that still allows their application to arrays of any rank and size. This paper illustrates the major steps in compiling generic, rank- and shape-invariant SAC specification into efficiently executable multithreaded code for parallel execution on shared memory multiprocessors. The effectiveness of the compilation techniques is demonstrated by means of a small case study on the PDE1 benchmark, which implements 3-dimensional red/black successive over-relaxation. Comparisons with HPF and ZPL show that despite the genericity of code, SAC achieves highly competitive runtime performance characteristics.
  • Single Assignment C — Efficient Support for High-level Array Operations in a Functional Setting Sven-Bodo Scholz (2003). Journal of Functional Programming 13 (6) pp. 1005–1059. DOI sacesfhlaoiafs.pdf BibTexAbstract
    @Article{ScholzJFP03,
      author   = {Sven-Bodo Scholz},
      title    = {Single Assignment C --- Efficient Support for High-level Array Operations in a Functional Setting},
      journal  = {Journal of Functional Programming},
      year     = {2003},
      volume   = {13},
      number   = {6},
      pages    = {1005--1059},
      contents = {Canonical reference for SAC},
      topics   = {SAC},
      summary    = {},
      abstract   = {This paper presents a novel approach for integrating arrays with access time O(1) into functional languages. It introduces n-dimensional arrays combined with a type system that supports hierarchies of array types with varying shape information as well as a shape-invariant form of array comprehension called with-loop. Together, these constructs allow for a programming style similar to that of array programming languages such as APL. We use Single Assignment C (SaC for short), a functional C-variant aimed at numerical applications that is based on the proposed design, to demonstrate that programs written in that style can be compiled to code whose runtime performance is competitive with that of hand-optimized Fortran programs. However, essential prerequisites for such performance figures are a shape inference system integrated in the type system as well as several high-level optimizations. Most notable of these is with-loop folding, an optimization technique for eliminating intermediate arrays.},
      category   = {core, design},
      doi        = {10.1.1.138.6995},
      url        = {SACESFHLAOIAFS.pdf},
    }
    This paper presents a novel approach for integrating arrays with access time O(1) into functional languages. It introduces n-dimensional arrays combined with a type system that supports hierarchies of array types with varying shape information as well as a shape-invariant form of array comprehension called with-loop. Together, these constructs allow for a programming style similar to that of array programming languages such as APL. We use Single Assignment C (SaC for short), a functional C-variant aimed at numerical applications that is based on the proposed design, to demonstrate that programs written in that style can be compiled to code whose runtime performance is competitive with that of hand-optimized Fortran programs. However, essential prerequisites for such performance figures are a shape inference system integrated in the type system as well as several high-level optimizations. Most notable of these is with-loop folding, an optimization technique for eliminating intermediate arrays.
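    As a taste of the two central ingredients, rank-generic array types and with-loops, here is a minimal sketch in present-day sac2c syntax (an illustrative function, not code from the paper):

      use Array: all;

      /* Shape-invariant: adds n to every element of an array of
         arbitrary rank and shape.                                */
      double[*] addn(double[*] a, double n)
      {
        return with {
                 ( 0 * shape(a) <= iv < shape(a) ) : a[iv] + n;
               } : genarray( shape(a), 0.0 );
      }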

2000

  • On Code Generation for Multi-generator With-loops in SaC Clemens Grelck, Dietmar Kreye, Sven-Bodo Scholz (2000). In Implementation of Functional Languages, 11th International Workshop (IFL'99), Lochem, The Netherlands, Selected Papers. pp. 77–94. Springer. DOI sac2c-codegen-wl-lochem-99.pdf BibTexAbstract
    @InProceedings{GrelKreySchoIFL99,
      author     = {Clemens Grelck and Dietmar Kreye and Sven-Bodo Scholz},
      title      = {On Code Generation for Multi-generator With-loops in Sac},
      booktitle  = {Implementation of Functional Languages, 11th International Workshop (IFL'99), Lochem, The Netherlands, Selected Papers},
      year       = {2000},
      editor     = {Pieter Koopman and Chris Clack},
      volume     = {1868},
      series     = {Lecture Notes in Computer Science},
      pages      = {77--94},
      publisher  = {Springer},
      abstract   = {Most array operations in SaC are specified in terms of so-called with-loops, a SaC-specific form of array comprehension. Due to the map-like semantics of with-loops, their loop instances can be computed in any order which provides considerable freedom when it comes to compiling them into nestings of for-loops in C. This paper discusses several different execution orders and their impact on compilation complexity, size of generated code, and execution runtimes. As a result, a multiply parameterized compilation scheme is proposed which achieves speedups of up to a factor of 16 when compared against a naïve compilation scheme.},
      category   = {core,design},
      doi        = {10.1007/10722298_5},
      pubaddress = {Berlin, Heidelberg, Germany},
      summary    = {This paper presents the essentials of Dietmar's Diploma thesis (see below). Besides an informal presentation of the compilation steps involved, the design choices made are substantiated by means of a few performance measurements.},
      topics     = {SAC,Loop Transformation},
      url        = {sac2c-codegen-WL-lochem-99.pdf},
    }
    Most array operations in SaC are specified in terms of so-called with-loops, a SaC-specific form of array comprehension. Due to the map-like semantics of with-loops, their loop instances can be computed in any order which provides considerable freedom when it comes to compiling them into nestings of for-loops in C. This paper discusses several different execution orders and their impact on compilation complexity, size of generated code, and execution runtimes. As a result, a multiply parameterized compilation scheme is proposed which achieves speedups of up to a factor of 16 when compared against a naïve compilation scheme.
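    The multi-generator with-loops discussed here can be sketched as follows (hypothetical example in current syntax): several generators partition one index space, and the compiler may realise them in any iteration order when lowering them to nested C for-loops.

      use Array: all;

      double[.] smooth(double[.] a)
      {
        n = shape(a)[[0]];
        /* Three generators jointly define the result: fixed boundary
           values and a smoothed interior.                            */
        return with {
                 ( [0]   <= iv < [1] )   : 0.0;
                 ( [1]   <= iv < [n-1] ) : 0.5 * (a[iv-[1]] + a[iv+[1]]);
                 ( [n-1] <= iv < [n] )   : 0.0;
               } : genarray( [n], 0.0 );
      }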

1999

  • Accelerating APL Programs with SaC Clemens Grelck, Sven-Bodo Scholz (1999). In International Conference on Array Processing Languages (APL'99), Scranton, Pennsylvania, USA. pp. 50–57. ACM Press. DOI sac-accel-apl-scranton-99.pdf BibTexAbstract
    @InProceedings{GrelSchoAPL99,
      author    = {Clemens Grelck and Sven-Bodo Scholz},
      title     = {Accelerating Apl Programs with Sac},
      booktitle = {International Conference on Array Processing Languages (APL'99), Scranton, Pennsylvania, USA},
      year      = {1999},
      editor    = {Olivier Lefèvre},
      volume    = {29},
      number    = {2},
      series    = {APL Quote Quad},
      pages     = {50--57},
      publisher = {ACM Press},
      abstract  = {The paper investigates how SAC, a purely functional language based on C syntax, relates to APL in terms of expressiveness and run-time behavior. To do so, three different excerpts of real world APL programs are examined. It is shown that after defining the required APL primitives in SAC, the example programs can be re-written in SAC with an almost one-to-one correspondence. Run-time comparisons between interpreting APL programs and compiled SAC programs show that speedups due to compilation vary between 2 and 500 for three representative benchmark programs.},
      category  = {core,apps,design},
      doi       = {10.1145/379277.312719},
      summary   = {This paper compares SAC to APL in terms of expressiveness and execution runtimes. Some APL applications are compiled by hand into almost identical SAC programs and the resulting execution times are discussed.},
      topics    = {SAC,APL},
      url       = {sac-accel-apl-scranton-99.pdf},
    }
    The paper investigates how SAC, a purely functional language based on C syntax, relates to APL in terms of expressiveness and run-time behavior. To do so, three different excerpts of real world APL programs are examined. It is shown that after defining the required APL primitives in SAC, the example programs can be re-written in SAC with an almost one-to-one correspondence. Run-time comparisons between interpreting APL programs and compiled SAC programs show that speedups due to compilation vary between 2 and 500 for three representative benchmark programs.
  • Shared Memory Multiprocessor Support for SaC Clemens Grelck (1999). In Implementation of Functional Languages, 10th International Workshop (IFL'98), London, England, UK, Selected Papers. pp. 38–54. Springer. DOI mt-support-london-98.pdf BibTexAbstract
    @InProceedings{GrelckIFL98,
      author     = {Clemens Grelck},
      title      = {Shared Memory Multiprocessor Support for Sac},
      booktitle  = {Implementation of Functional Languages, 10th International Workshop (IFL'98), London, England, UK, Selected Papers},
      year       = {1999},
      editor     = {Kevin Hammond and Tony Davie and Chris Clack},
      volume     = {1595},
      series     = {Lecture Notes in Computer Science},
      pages      = {38--54},
      publisher  = {Springer},
      abstract   = {SaC (Single Assignment C) is a strict, purely functional programming language primarily designed with numerical applications in mind. Particular emphasis is on efficient support for arrays both in terms of language expressiveness and in terms of runtime performance. Array operations in SaC are based on element-wise specifications using so-called With-loops. These language constructs are also well-suited for concurrent execution on multiprocessor systems. This paper outlines an implicit approach to compile SaC programs for multi-threaded execution on shared memory architectures. Besides the basic compilation scheme, a brief overview of the runtime system is given. Finally, preliminary performance figures demonstrate that this approach is well-suited to achieve almost linear speedups.},
      category   = {core,design,par,opt},
      doi        = {10.1007/3-540-48515-5_3},
      pubaddress = {Berlin, Heidelberg, Germany},
      summary    = {This paper introduces the basic concepts of multi-threaded program execution on shared memory multiprocessor systems. The described approach is completely implicit, i.e. the compiler and the runtime system handle everything from granularity to load balancing on their own. The paper also contains preliminary performance evaluations.},
      topics     = {SAC,Shared Memory},
      url        = {mt-support-london-98.pdf},
    }
    SaC (Single Assignment C) is a strict, purely functional programming language primarily designed with numerical applications in mind. Particular emphasis is on efficient support for arrays both in terms of language expressiveness and in terms of runtime performance. Array operations in SaC are based on element-wise specifications using so-called With-loops. These language constructs are also well-suited for concurrent execution on multiprocessor systems. This paper outlines an implicit approach to compile SaC programs for multi-threaded execution on shared memory architectures. Besides the basic compilation scheme, a brief overview of the runtime system is given. Finally, preliminary performance figures demonstrate that this approach is well-suited to achieve almost linear speedups.

1998

  • On Defining Application-specific High-level Array Operations by Means of Shape-invariant Programming Facilities Sven-Bodo Scholz (1998). In International Conference on Array Processing Languages (APL'98), Rome, Italy. pp. 40–45. ACM Press. DOI sac-defining-array-ops-rome-98.pdf BibTexAbstract
    @InProceedings{ScholzAPL98,
      author    = {Sven-Bodo Scholz},
      title     = {On Defining Application-specific High-level Array Operations by Means of Shape-invariant Programming Facilities},
      booktitle = {International Conference on Array Processing Languages (APL'98), Rome, Italy},
      year      = {1998},
      editor    = {Sergio Picchi and Marco Micocci},
      pages     = {40--45},
      publisher = {ACM Press},
      abstract  = {Most of the existing high-level array-processing languages support a fixed set of pre-defined array operations and a few higher-order functions for constructing new array operations from existing ones. In this paper, we discuss a more general approach made feasible by SAC (for Single Assignment C), a functional variant of C. SAC provides a meta-level language construct called WITH-loop which may be considered a sophisticated variant of the FORALL-loops in HPF or of array comprehensions in functional languages. It allows for the element-wise specification of high-level operations on arrays of any dimensionality: any set of high-level array operations can be specified by means of WITH-loops and be made available in a library. This not only improves the flexibility of specifications, but also simplifies the compilation process. By means of a few examples it is shown that the high-level operations that are typically available in array processing languages such as APL or FORTRAN90 can be easily specified as WITH-loops in SAC. Furthermore, we briefly outline the most important optimization techniques used in the current SAC compiler for achieving efficiently executable code. The paper finally presents a performance comparison between a high-level specification for the multigrid relaxation kernel of the NAS benchmarks in SAC on the one hand and low-level specifications in SISAL and in FORTRAN77 on the other hand. It shows that the SAC implementation, despite its higher level of abstraction, is competitive with the other two both in terms of program runtimes and memory consumption.},
      category  = {core,design},
      doi       = {10.1145/327559.327613},
      summary   = {This paper shows how APL programs can be translated into equivalent but substantially faster SAC programs with little effort.},
      topics    = {SAC},
      url       = {sac-defining-array-ops-rome-98.pdf},
    }
    Most of the existing high-level array-processing languages support a fixed set of pre-defined array operations and a few higher-order functions for constructing new array operations from existing ones. In this paper, we discuss a more general approach made feasible by SAC (for Single Assignment C), a functional variant of C. SAC provides a meta-level language construct called WITH-loop which may be considered a sophisticated variant of the FORALL-loops in HPF or of array comprehensions in functional languages. It allows for the element-wise specification of high-level operations on arrays of any dimensionality: any set of high-level array operations can be specified by means of WITH-loops and be made available in a library. This not only improves the flexibility of specifications, but also simplifies the compilation process. By means of a few examples it is shown that the high-level operations that are typically available in array processing languages such as APL or FORTRAN90 can be easily specified as WITH-loops in SAC. Furthermore, we briefly outline the most important optimization techniques used in the current SAC compiler for achieving efficiently executable code. The paper finally presents a performance comparison between a high-level specification for the multigrid relaxation kernel of the NAS benchmarks in SAC on the one hand and low-level specifications in SISAL and in FORTRAN77 on the other hand. It shows that the SAC implementation, despite its higher level of abstraction, is competitive with the other two both in terms of program runtimes and memory consumption.
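    The paper's central point, that APL-like primitives can live in a library rather than in the language, can be illustrated with a hedged sketch in current SaC syntax (a simplified illustration, not the actual stdlib definition; named vtake to avoid clashing with the stdlib's take):

      use Array: all;

      /* APL-style take on vectors: the first n elements of a. */
      double[.] vtake(int n, double[.] a)
      {
        return with {
                 ( [0] <= iv < [n] ) : a[iv];
               } : genarray( [n], 0.0 );
      }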
  • With-loop-folding in SaC — Condensing Consecutive Array Operations Sven-Bodo Scholz (1998). In Implementation of Functional Languages, 9th International Workshop (IFL'97), St. Andrews, UK, Selected Papers. pp. 72–92. Springer. DOI wlf-st-andrews-97.pdf BibTexAbstract
    @InProceedings{ScholzIFL1997,
      author     = {Sven-Bodo Scholz},
      title      = {With-loop-folding in Sac --- Condensing Consecutive Array Operations},
      booktitle  = {Implementation of Functional Languages, 9th International Workshop (IFL'97), St. Andrews, UK, Selected Papers},
      year       = {1998},
      editor     = {Chris Clack and Tony Davie and Kevin Hammond},
      volume     = {1467},
      series     = {Lecture Notes in Computer Science},
      pages      = {72--92},
      publisher  = {Springer},
      abstract   = {This paper introduces a new compiler optimization called With-loop-folding. It is based on a special loop construct, the with-loop, which in the functional language SAC (for Single Assignment C) serves as a versatile vehicle to describe array operations on an elementwise basis. A general mechanism for combining two of these With-loops into a single loop construct is presented. This mechanism constitutes a powerful tool when it comes to generate efficiently executable code from high-level array specifications. By means of a few examples it is shown that even complex nestings of array operations similar to those available in APL can be transformed into single loop operations which are similar to hand-optimized With-loop specifications. As a consequence, the way a complex array operation is combined from primitive array operations does not affect the runtime performance of the compiled code, i.e., the programmer is liberated from the burden to take performance considerations into account when specifying complex array operations.},
      category   = {core,design,opt},
      doi        = {10.1007/BFb0055425},
      isbn       = {978-3-540-64849-9},
      pubaddress = {Berlin, Heidelberg, Germany},
      topics     = {SAC,Avoiding Temporaries,Implementation of Arrays},
      url        = {wlf-st-andrews-97.pdf},
    }
    This paper introduces a new compiler optimization called With-loop-folding. It is based on a special loop construct, the with-loop, which in the functional language SAC (for Single Assignment C) serves as a versatile vehicle to describe array operations on an elementwise basis. A general mechanism for combining two of these With-loops into a single loop construct is presented. This mechanism constitutes a powerful tool when it comes to generate efficiently executable code from high-level array specifications. By means of a few examples it is shown that even complex nestings of array operations similar to those available in APL can be transformed into single loop operations which are similar to hand-optimized With-loop specifications. As a consequence, the way a complex array operation is combined from primitive array operations does not affect the runtime performance of the compiled code, i.e., the programmer is liberated from the burden to take performance considerations into account when specifying complex array operations.
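    A tiny worked example of the mechanism (hypothetical code in current syntax): folding substitutes the reference to the intermediate array by its defining expression, so the temporary is never materialised.

      use Array: all;

      double[*] madd(double[*] a, double[*] b, double[*] c)
      {
        /* As written: tmp is a full intermediate array. */
        tmp = with { ( 0 * shape(a) <= iv < shape(a) ) : a[iv] + b[iv]; }
              : genarray( shape(a), 0.0 );
        /* With-loop-folding turns the next loop's body into
           (a[iv] + b[iv]) * c[iv], eliminating tmp altogether. */
        return with { ( 0 * shape(a) <= iv < shape(a) ) : tmp[iv] * c[iv]; }
               : genarray( shape(a), 0.0 );
      }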

1997

  • An Overview of SaC – a Functional Language for Numerical Applications Sven-Bodo Scholz (1997). In Programming Languages and Fundamentals of Programming, Technical Report 9717. Institut für Informatik und Praktische Mathematik, Universität Kiel. BibTex
    @InProceedings{ScholzKPS97,
      author    = {Sven-Bodo Scholz},
      title     = {An Overview of SaC -- a Functional Language for Numerical Applications},
      booktitle = {Programming Languages and Fundamentals of Programming, Technical Report 9717},
      year      = {1997},
      editor    = {R. Berghammer and F. Simon},
      publisher = {{Institut f{\"u}r Informatik und Praktische Mathe\-matik, Universit{\"a}t Kiel}},
      category  = {core,design,apps},
      topics    = {SAC,on SAC},
    }

1996

  • Single Assignment C – Entwurf und Implementierung einer funktionalen C-Variante mit spezieller Unterstützung shape-invarianter Array-Operationen (Design and Implementation of a Functional C Variant with Special Support for Shape-Invariant Array Operations) Sven-Bodo Scholz (1996). Kiel, Germany. sac-design-sbs-phd-96.pdf BibTex
    @PhdThesis{Scholz96,
      author   = {Sven-Bodo Scholz},
      title    = {Single Assignment C -- Entwurf Und Implementierung Einer Funktionalen C-variante Mit Spezieller Unterstützung Shape-invarianter Array-operationen},
      school   = {Institut für Informatik und Praktische Mathematik, Christian-Albrechts-Universität},
      year     = {1996},
      address  = {Kiel, Germany},
      note     = {Shaker Verlag, Aachen, 1997},
      category = {design,core,types,apps},
      summary  = {Sven-Bodo's PhD thesis is the most comprehensive document about SAC. The language design is described in detail. Moreover, a formal semantics of SAC is given. Further points of emphasis are the type system and the compilation process. Multigrid relaxation is used as a case study to illustrate programming with SAC.},
      topics   = {SAC,Language Manuals,Implementation of Arrays,Language Design},
      url      = {sac-design-sbs-phd-96.pdf},
    }
  • Integration eines Modul- und Klassen-Konzeptes in die funktionale Programmiersprache SaC – Single Assignment C (Integration of a Module and Class Concept into the Functional Programming Language SaC) Clemens Grelck (1996). Kiel, Germany. sac2c-modules-classes-dipl-cg-96.pdf BibTex
    @MastersThesis{Grelck96,
      author   = {Clemens Grelck},
      title    = {Integration Eines Modul- Und Klassen-konzeptes in Die Funktionale Programmiersprache Sac -- Single Assignment C},
      school   = {Institut für Informatik und Praktische Mathematik, Christian-Albrechts-Universität},
      year     = {1996},
      address  = {Kiel, Germany},
      category = {design,core,modules},
      summary  = {Clemens' Diploma thesis is a detailed description of the module system, the integration of states and state modifications including the realization of powerful I/O facilities, and the foreign language interface which allows to use functions implemented in C within SAC programs. Both language design and implementation issues are covered.},
      topics   = {SAC,Uniqueness Types,Language Interfacing},
      url      = {sac2c-modules-classes-dipl-cg-96.pdf},
    }

1995

  • Classes and Objects As Basis for I/O in SaC Clemens Grelck, Sven-Bodo Scholz (1995). In 7th International Workshop on Implementation of Functional Languages (IFL'95), Båstad, Sweden. pp. 30–44. Chalmers University of Technology, Gothenburg, Sweden. sac-classes-objects-bastad-95.pdf BibTexAbstract
    @InProceedings{GrelSchoIFL95,
      author    = {Grelck, Clemens and Scholz, Sven-Bodo},
      title     = {Classes and Objects As Basis for I/o in Sac},
      booktitle = {7th International Workshop on Implementation of Functional Languages (IFL'95), B{\aa}stad, Sweden},
      year      = {1995},
      editor    = {Thomas Johnsson},
      pages     = {30--44},
      publisher = {Chalmers University of Technology, Gothenburg, Sweden},
      abstract  = {In imperative languages I/O is realized through sequences of side-effecting function applications/procedure invocations. This seems to be a suitable way of specifying I/O since it coincides with an intuitive understanding of it as sequences of actions. In contrast, functional languages carefully have to avoid side-effects to sustain referential transparency. Many different solutions, such as dialogues, continuations, monads and uniqueness typing have been proposed. The I/O facilities of SaC are based on uniqueness typing. Instead of using an explicit type attribute as in Clean, unique types are introduced as special modules called classes. To provide a syntax as close as possible to that of imperative languages, we propose two new mechanisms to be included on top of classes in SaC: a call-by-reference mechanism and global objects. Although a combination of both mechanisms allows for very imperative-like notations, we can define a purely functional semantics. Thus we combine the advantages of referential transparency with the expressiveness of imperative I/O. Moreover, these two mechanisms allow the programmer to introduce and manipulate states of arbitrary data-structures within SaC.},
      category  = {design,core,states},
      summary   = {This paper describes the way states and state modifications are incorporated into the purely functional world of SAC based on uniqueness typing and an extension of the module system. The integration of I/O facilities is taken as a case study. The paper is the best English introduction to this part of SAC.},
      topics    = {SAC,Uniqueness Types},
      url       = {sac-classes-objects-bastad-95.pdf},
    }
    In imperative languages I/O is realized through sequences of side-effecting function applications/procedure invocations. This seems to be a suitable way of specifying I/O since it coincides with an intuitive understanding of it as sequences of actions. In contrast, functional languages carefully have to avoid side-effects to sustain referential transparency. Many different solutions, such as dialogues, continuations, monads and uniqueness typing have been proposed. The I/O facilities of SaC are based on uniqueness typing. Instead of using an explicit type attribute as in Clean, unique types are introduced as special modules called classes. To provide a syntax as close as possible to that of imperative languages, we propose two new mechanisms to be included on top of classes in SaC: a call-by-reference mechanism and global objects. Although a combination of both mechanisms allows for very imperative-like notations, we can define a purely functional semantics. Thus we combine the advantages of referential transparency with the expressiveness of imperative I/O. Moreover, these two mechanisms allow the programmer to introduce and manipulate states of arbitrary data-structures within SaC.

1994

  • Single-assignment C — Functional Programming Using Imperative Style Sven-Bodo Scholz (1994). In 6th International Workshop on Implementation of Functional Languages (IFL'94), Norwich, England, UK. pp. 211–2113. University of East Anglia, Norwich, England, UK. sac-overview-norwich-94.pdf BibTex
    @InProceedings{ScholzIFL94,
      author    = {Sven-Bodo Scholz},
      title     = {Single-assignment C --- Functional Programming Using Imperative Style},
      booktitle = {6th International Workshop on Implementation of Functional Languages (IFL'94), Norwich, England, UK},
      year      = {1994},
      editor    = {John Glauert},
      pages     = {211--2113},
      publisher = {University of East Anglia, Norwich, England, UK},
      category  = {design,core,types},
      summary   = {This is the first paper to propose SAC. The basic language design ideas are explained and its syntax is defined. However, SAC has changed a lot since then. So, this classic paper can no longer be recommended as an introductory source of information about SAC.},
      topics    = {SAC,Implementation of Arrays,Language Design},
      url       = {sac-overview-norwich-94.pdf},
    }

2015

  • Type-driven Data Layouts for Improved Vectorisation Artjoms Šinkarovs, Sven-Bodo Scholz (May 2015). Concurrency and Computation: Practice and Experience 28 (7) pp. 2092–2119. Wiley-Blackwell. DOI BibTexAbstract
    @Article{SinkSchoCCPE15,
      author    = {Šinkarovs, Artjoms and Scholz, Sven-Bodo},
      title     = {Type-driven Data Layouts for Improved Vectorisation},
      journal   = {Concurrency and Computation: Practice and Experience},
      year      = {2015},
      volume    = {28},
      number    = {7},
      pages     = {2092--2119},
      month     = {May},
      abstract  = {Vector instructions of modern CPUs are crucially important for the performance of compute-intensive algorithms. Auto-vectorisation often fails because of an unfortunate choice of data layout by the programmer. This paper proposes a data layout inference for auto-vectorisation that identifies layout transformations that convert single instruction, multiple data-unfavourable layouts of data structures into favourable ones. We present a type system for layout transformations, and we sketch an inference algorithm for it. Finally, we present some initial performance figures for the impact of the inferred layout transformations. They show that non-intuitive layouts that are inferred through our system can have a vast performance impact on compute intensive programs.},
      category  = {types,opt,par},
      doi       = {10.1002/cpe.3501},
      issn      = {1532-0626},
      keywords  = {Data parallelism, Hpc, Vectorisation},
      publisher = {Wiley-Blackwell},
    }
    Vector instructions of modern CPUs are crucially important for the performance of compute-intensive algorithms. Auto-vectorisation often fails because of an unfortunate choice of data layout by the programmer. This paper proposes a data layout inference for auto-vectorisation that identifies layout transformations that convert single instruction, multiple data-unfavourable layouts of data structures into favourable ones. We present a type system for layout transformations, and we sketch an inference algorithm for it. Finally, we present some initial performance figures for the impact of the inferred layout transformations. They show that non-intuitive layouts that are inferred through our system can have a vast performance impact on compute intensive programs.
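    In SaC terms, the layout question can be made concrete with a small hypothetical example: points stored with shape [n,3] interleave x, y and z (unfavourable for SIMD operations on a single coordinate), whereas the transposed [3,n] layout keeps each coordinate contiguous.

      use Array: all;

      /* Array-of-structs -> struct-of-arrays: after the transpose,
         soa[[0]] is a contiguous, unit-stride vector of all x values. */
      double[.,.] to_soa(double[.,.] aos)
      {
        return transpose(aos);
      }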

2008

  • From Contracts Towards Dependent Types: Proofs by Partial Evaluation Stephan Herhut, Sven-Bodo Scholz, Robert Bernecky, Clemens Grelck, Kai Trojahner (2008). In 19th International Symposium on Implementation and Application of Functional Languages (IFL'07), Freiburg, Germany, Revised Selected Papers. pp. 254–273. Springer. DOI fctdt.pdf BibTexAbstract
    @InProceedings{HerhSchoBernetalIFL07,
      author     = {Stephan Herhut and Sven-Bodo Scholz and Robert Bernecky and Clemens Grelck and Kai Trojahner},
      title      = {From Contracts Towards Dependent Types: Proofs by Partial Evaluation},
      booktitle  = {19th International Symposium on Implementation and Application of Functional Languages (IFL'07), Freiburg, Germany, Revised Selected Papers},
      year       = {2008},
      editor     = {Olaf Chitil and Zoltan Horv\'ath and Vikt\'oria Zs\'ok},
      volume     = {5083},
      series     = {Lecture Notes in Computer Science},
      pages      = {254--273},
      publisher  = {Springer},
      affil      = {ctca},
      isbn       = {978-3-540-85372-5},
      pubaddress = {Berlin, Heidelberg, Germany},
      topics     = {SAC},
      summary    = {},
      abstract   = {The specification and resolution of non-trivial domain constraints has become a well-recognised measure for improving the stability of large software systems. In this paper we propose an approach based on partial evaluation which tries to prove such constraints statically as far as possible and inserts efficient dynamic checks otherwise.},
      category   = {types, opt},
      doi        = {10.1007/978-3-540-85373-2_15},
      url        = {fctdt.pdf} 
    }
    The specification and resolution of non-trivial domain constraints has become a well-recognised measure for improving the stability of large software systems. In this paper we propose an approach based on partial evaluation which tries to prove such constraints statically as far as possible and inserts efficient dynamic checks otherwise.

2007

  • Dependently Typed Array Programs Don't Go Wrong Kai Trojahner, Clemens Grelck (2007). In 19th Nordic Workshop on Programming Theory (NWPT'07), Oslo, Norway, October 10-12, 2007. pp. 64–66. University of Oslo, Institute of Informatics. DOI dtapdgw.pdf BibTexAbstract
    @Article{TrojGrelNWPT07,
      AUTHOR    = {Kai Trojahner and Clemens Grelck},
      TITLE     = {Dependently Typed Array Programs Don't Go Wrong},
      EDITOR    = {Einar Broch Johnsen and Olaf Owe and Gerardo Schneider},
      BOOKTITLE = {19th Nordic Workshop on Programming Theory (NWPT'07), Oslo, Norway, October 10-12, 2007},
      PUBLISHER = {University of Oslo, Institute of Informatics},
      SERIES    = {Research Report},
      VOLUME    = {366},
      NUMBER    = {},
      YEAR      = 2007,
      PAGES     = {64--66},
      NOTE      = {[ISBN 82-7368-324-9]},
      CONTENTS  = {},
      sourceURL = {},
      TOPICS    = {},
      summary    = {},
      abstract   = {The array programming paradigm adopts multidimensional arrays as the fundamental data structures of computation. Array operations process entire arrays instead of just single elements. This makes array programs highly expressive and introduces data parallelism in a natural way. Array programming imposes non-trivial structural constraints on ranks, shapes, and element values of arrays. A prominent example of such violations are out-of-bound array accesses. Usually, such constraints are enforced by means of run time checks. Both the run time overhead inflicted by dynamic constraint checking and the uncertainty of proper program evaluation are undesirable. In this paper, we propose a novel type system for array programs based on dependent types. Our type system makes dynamic constraint checks obsolete and guarantees orderly evaluation of well-typed programs. We employ integer vectors of statically unknown length to index array types. We also show how constraints on these vectors are resolved using a suitable reduction to integer scalars. Our presentation is based on a functional array calculus that captures the essence of the paradigm without the legacy and obfuscation of a fully-fledged array programming language.},
      category   = {types},
      doi        = {10.1016/j.jlap.2009.03.002},
      url        = {dtapdgw.pdf},
    }
    The array programming paradigm adopts multidimensional arrays as the fundamental data structures of computation. Array operations process entire arrays instead of just single elements. This makes array programs highly expressive and introduces data parallelism in a natural way. Array programming imposes non-trivial structural constraints on ranks, shapes, and element values of arrays. A prominent example of such a constraint violation is the out-of-bound array access. Usually, such constraints are enforced by means of run time checks. Both the run time overhead inflicted by dynamic constraint checking and the uncertainty of proper program evaluation are undesirable. In this paper, we propose a novel type system for array programs based on dependent types. Our type system makes dynamic constraint checks obsolete and guarantees orderly evaluation of well-typed programs. We employ integer vectors of statically unknown length to index array types. We also show how constraints on these vectors are resolved using a suitable reduction to integer scalars. Our presentation is based on a functional array calculus that captures the essence of the paradigm without the legacy and obfuscation of a fully-fledged array programming language.
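    To give a flavour of shape-indexed array types, here is a rough approximation of ours written in the style of SaC type patterns (a feature of much later sac2c versions; the paper itself works in its own calculus, not in SaC syntax): the shared symbolic length n renders the usual dynamic shape check unnecessary.

      use Array: all;

      /* Both arguments and the result share the symbolic length n, so a
         well-typed call cannot trigger a shape mismatch at run time.    */
      int[n] vadd (int[n] a, int[n] b)
      {
        return a + b;
      }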

2006

  • Generic Programming on the Structure of Homogeneously Nested Arrays Stephan Herhut, Sven-Bodo Scholz (2006). In 7th Symposium on Trends in Functional Programming. pp. 351–366. University of Nottingham. gpotsohna.pdf BibTexAbstract
    @InProceedings{HerhSchoTFP06,
      AUTHOR    = {Stephan Herhut and Sven-Bodo Scholz},
      EMAIL     = {},
      TITLE     = {Generic Programming on the Structure of Homogeneously Nested Arrays},
      EDITOR    = {Henrik Nilsson},
      BOOKTITLE = {7th Symposium on Trends in Functional Programming},
      PUBLISHER = {University of Nottingham},
      YEAR      = 2006,
      PAGES     = {351--366},
      NOTE      = {},
      KEYWORDS  = {},
      CONTENTS  = {},
      sourceURL = {},
      TOPICS    = {SAC},
      AFFIL     = {ctca},
      summary    = {},
      abstract   = {In this paper we propose a new means to model and operate on nested arrays that allows for a high level of abstraction without introducing a performance penalty. We achieve this by using a nesting structure on array types which allows us to shift the nesting information of arrays from the runtime representation level to the type system level. This information can then be exploited for generic function definitions on the nesting structure of arrays which, as we show, neatly integrates with subtyping based function overloading. Finally, we demonstrate by means of an example how nested arrays and generic function definitions can be fully stripped out using existing optimisation techniques.},
      category   = {types, design},
      doi        = {},
      url        = {gpotsohna.pdf}
    }
    In this paper we propose a new means to model and operate on nested arrays that allows for a high level of abstraction without introducing a performance penalty. We achieve this by using a nesting structure on array types which allows us to shift the nesting information of arrays from the runtime representation level to the type system level. This information can then be exploited for generic function definitions on the nesting structure of arrays which, as we show, neatly integrates with subtyping based function overloading. Finally, we demonstrate by means of an example how nested arrays and generic function definitions can be fully stripped out using existing optimisation techniques.

1996

  • Single Assignment C – Entwurf und Implementierung einer funktionalen C-Variante mit spezieller Unterstützung shape-invarianter Array-Operationen [Design and Implementation of a Functional C Variant with Special Support for Shape-Invariant Array Operations] Sven-Bodo Scholz (1996). Kiel, Germany. sac-design-sbs-phd-96.pdf BibTex
    @PhdThesis{Scholz96,
      author   = {Sven-Bodo Scholz},
      title    = {Single Assignment C -- Entwurf und Implementierung einer funktionalen C-Variante mit spezieller Unterstützung shape-invarianter Array-Operationen},
      school   = {Institut für Informatik und Praktische Mathematik, Christian-Albrechts-Universität},
      year     = {1996},
      address  = {Kiel, Germany},
      note     = {Shaker Verlag, Aachen, 1997},
      category = {design,core,types,apps},
      summary  = {Sven-Bodo's PhD thesis is the most comprehensive document about SAC. The language design is described in detail. Moreover, a formal semantics of SAC is given. Further points of emphasis are the type system and the compilation process. Multigrid relaxation is used as a case study to illustrate programming with SAC.},
      topics   = {SAC,Language Manuals,Implementation of Arrays,Language Design},
      url      = {sac-design-sbs-phd-96.pdf},
    }

1994

  • Single-assignment C — Functional Programming Using Imperative Style Sven-Bodo Scholz (1994). In 6th International Workshop on Implementation of Functional Languages (IFL'94), Norwich, England, UK. pp. 211–213. University of East Anglia, Norwich, England, UK. sac-overview-norwich-94.pdf BibTex
    @InProceedings{ScholzIFL94,
      author    = {Sven-Bodo Scholz},
      title     = {Single-assignment C --- Functional Programming Using Imperative Style},
      booktitle = {6th International Workshop on Implementation of Functional Languages (IFL'94), Norwich, England, UK},
      year      = {1994},
      editor    = {John Glauert},
      pages     = {211--213},
      publisher = {University of East Anglia, Norwich, England, UK},
      category  = {design,core,types},
      summary   = {This is the first paper to propose SAC. The basic language design ideas are explained and its syntax is defined. However, SAC has changed a lot since then. So, this classic paper can no longer be recommended as an introductory source of information about SAC.},
      topics    = {SAC,Implementation of Arrays,Language Design},
      url       = {sac-overview-norwich-94.pdf},
    }

2022

  • In-Place-Folding of Non-Scalar Hyper-Planes of Multi-Dimensional Arrays Gijs van Cuyck, Sven-Bodo Scholz (2022). In 33rd Symposium on Implementation and Application of Functional Languages. pp. 29–41. Association for Computing Machinery. New York, NY, USA. DOIcuyckscholzifl21.pdf BibTexAbstract
    @inproceedings{CuycSchoIFL21,
     author = {van Cuyck, Gijs and Scholz, Sven-Bodo},
     title = {In-Place-Folding of Non-Scalar Hyper-Planes of Multi-Dimensional Arrays},
     year = {2022},
     isbn = {9781450386449},
     publisher = {Association for Computing Machinery},
     address = {New York, NY, USA},
     url = {CuyckScholzIFL21.pdf},
     doi = {10.1145/3544885.3544893},
     summary = {Doing non-scalar folds in place.},
     category = {core, opt},
     abstract = {Memory management plays a key role when trying to compile functional programs into efficiently executable code. In particular when using flat representations for multi-dimensional arrays, i.e., when using a single memory block for the entire data of a multi-dimensional array, in-place updates become crucial for highly competitive performance. This paper proposes a novel code generation technique for performing fold-operations on hyper-planes of multi-dimensional arrays, where the fold-operation itself operates on non-scalar sub-arrays, i.e., on vectors or higher-dimensional arrays. This technique allows for a single result array allocation over the entire folding operation without requiring the folding operation itself to be scalarised. It enables the utilisation of vector operations without any added memory allocation or copying overhead. We describe our technique in the context of SaC, sketch our implementation in the context of the compiler sac2c and provide some initial performance measurements that give an indication of the effectiveness of this new technique.},
     booktitle = {33rd Symposium on Implementation and Application of Functional Languages},
 pages = {29--41},
     numpages = {13},
     keywords = {reference counting, compiler optimisation, memory management, SaC},
     location = {Nijmegen, Netherlands},
     series = {IFL '21} 
    }
    Memory management plays a key role when trying to compile functional programs into efficiently executable code. In particular when using flat representations for multi-dimensional arrays, i.e., when using a single memory block for the entire data of a multi-dimensional array, in-place updates become crucial for highly competitive performance. This paper proposes a novel code generation technique for performing fold-operations on hyper-planes of multi-dimensional arrays, where the fold-operation itself operates on non-scalar sub-arrays, i.e., on vectors or higher-dimensional arrays. This technique allows for a single result array allocation over the entire folding operation without requiring the folding operation itself to be scalarised. It enables the utilisation of vector operations without any added memory allocation or copying overhead. We describe our technique in the context of SaC, sketch our implementation in the context of the compiler sac2c and provide some initial performance measurements that give an indication of the effectiveness of this new technique.
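    The class of operations the paper targets can be pictured with a small SaC sketch of ours (not the paper's code): folding the rows of a matrix with vector-valued addition, so that the accumulator is itself a non-scalar array.

      use Array: all;

      /* Sum of the n rows of an [n,k] matrix: the fold operates on whole
         k-element vectors, i.e. on non-scalar hyper-planes, which is
         exactly the case the proposed code generation keeps in place.   */
      double[.] rowsum (double[.,.] m)
      {
        n = shape (m)[0];
        k = shape (m)[1];
        return with {
                 ([0] <= [i] < [n]) : m[[i]];
               } : fold (+, genarray ([k], 0.0));
      }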

2019

  • Tensor Comprehensions in SaC Sven-Bodo Scholz, Artjoms Šinkarovs (2019). In Proceedings of the 31st Symposium on the Implementation and Application of Functional Programming Languages. ACM. New York, NY, USA. DOItensor-comprehensions-ifl19.pdf BibTexAbstract
    @inproceedings{SchoShinIFL19,
     author = {Sven-Bodo Scholz and Artjoms Šinkarovs},
     title = {Tensor Comprehensions in {SaC}},
     year = {2019},
     isbn = {9781450375627},
     publisher = {ACM},
     address = {New York, NY, USA},
     booktitle = {Proceedings of the 31st Symposium on the Implementation and Application of Functional Programming Languages},
     series = {IFL '19},
     articleno = {15},
     numpages = {13},
     doi = {10.1145/3412932.3412947},
     url = {tensor-comprehensions-IFL19.pdf},
     summary = {Motivation,syntax, and semantics of Tensor Comprehensions in SaC.},
 abstract = {We propose a new notation for data parallel operators on multi-dimensional arrays named tensor comprehensions. This notation combines the basic principle of array-comprehensions with syntactical shortcuts very close to those found in the so-called Tensor Notations used in Physics and Mathematics. As a result, complex operators with rich semantics can be defined concisely. The key to this conciseness lies in the ability to define shape-polymorphic operations combined with the ability to infer array shapes from the immediate context. The paper provides a definition of the proposed notation, a formal shape inference process, as well as a set of re-write rules that translates tensor comprehensions as a zero-cost syntactic sugar into standard SaC expressions.},
     keywords = {Array comprehensions,
     Call by value, Array programming, Functional languages},
     category  = {core,design},
     location = {Singapore}
    }
    We propose a new notation for data parallel operators on multi-dimensional arrays named tensor comprehensions. This notation combines the basic principle of array-comprehensions with syntactical shortcuts very close to those found in the so-called Tensor Notations used in Physics and Mathematics. As a result, complex operators with rich semantics can be defined concisely. The key to this conciseness lies in the ability to define shape-polymorphic operations combined with the ability to infer array shapes from the immediate context. The paper provides a definition of the proposed notation, a formal shape inference process, as well as a set of re-write rules that translates tensor comprehensions as a zero-cost syntactic sugar into standard SaC expressions.
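    Two small examples of ours in the spirit of the proposed notation: shapes are inferred from the arrays being indexed, so neither bounds nor index computations are spelled out.

      use Array: all;

      double[.,.] transpose_add (double[.,.] a, double[.,.] b)
      {
        bt = { [i,j] -> b[j,i] };        /* transpose; bounds inferred from b */
        return { iv -> a[iv] + bt[iv] }; /* element-wise sum over the full shape */
      }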

2018

  • Extended Memory Reuse: An Optimisation for Reducing Memory Allocations Hans-Nikolai Vießmann, Artjoms Šinkarovs, Sven-Bodo Scholz (2018). In Proceedings of the 30th Symposium on Implementation and Application of Functional Languages. pp. 107–118. Association for Computing Machinery. New York, NY, USA. DOIextended-memory-reuse-ifl18.pdf BibTexAbstract
    @inproceedings{ViesSinkSchoIFL18,
    author = {Vie\ss{}mann, Hans-Nikolai and \v{S}inkarovs, Artjoms and Scholz, Sven-Bodo},
    title = {Extended Memory Reuse: An Optimisation for Reducing Memory Allocations},
    year = {2018},
    isbn = {9781450371438},
    publisher = {Association for Computing Machinery},
    address = {New York, NY, USA},
    doi = {10.1145/3310232.3310242},
    summary = {Explains all details of the optimisation EMR.},
    url = {extended-memory-reuse-IFL18.pdf},
    abstract = {In this paper we present an optimisation for reference counting based garbage collection. The optimisation aims at reducing the total number of calls to the heap manager while preserving the key benefits of reference counting, i.e. the opportunities for in-place updates as well as memory deallocation without global garbage collection. The key idea is to carefully extend the lifetime of variables so that memory deallocations followed by memory allocations of the same size can be replaced by a direct memory reuse. Such memory reuse turns out particularly useful in the context of innermost loops of compute-intensive applications. It leads to a runtime behaviour that performs pointer swaps between buffers in the same way it would be implemented manually in languages that require explicit memory management, e.g. C. We have implemented the proposed optimisation in the context of the Single-Assignment C compiler tool chain. The paper provides an algorithmic description of our optimisation and an evaluation of its effectiveness over a collection of benchmarks including a subset of the Rodinia benchmarks and the NAS Parallel Benchmarks. We show that for several benchmarks with allocations within loops our optimisation reduces the amount of allocations by a few orders of magnitude. We also observe no negative impact on the overall memory footprint nor on the overall runtime. Instead, for some sequential executions we find mild improvement, and on GPU devices we observe speedups of up to a factor of 4x.},
    booktitle = {Proceedings of the 30th Symposium on Implementation and Application of Functional Languages},
    pages = {107--118},
    numpages = {12},
    category = {core,opt},
    keywords = {reference counting, memory management, compiler optimisation},
    location = {Lowell, MA, USA},
    series = {IFL 2018}
    }
    In this paper we present an optimisation for reference counting based garbage collection. The optimisation aims at reducing the total number of calls to the heap manager while preserving the key benefits of reference counting, i.e. the opportunities for in-place updates as well as memory deallocation without global garbage collection. The key idea is to carefully extend the lifetime of variables so that memory deallocations followed by memory allocations of the same size can be replaced by a direct memory reuse. Such memory reuse turns out particularly useful in the context of innermost loops of compute-intensive applications. It leads to a runtime behaviour that performs pointer swaps between buffers in the same way it would be implemented manually in languages that require explicit memory management, e.g. C. We have implemented the proposed optimisation in the context of the Single-Assignment C compiler tool chain. The paper provides an algorithmic description of our optimisation and an evaluation of its effectiveness over a collection of benchmarks including a subset of the Rodinia benchmarks and the NAS Parallel Benchmarks. We show that for several benchmarks with allocations within loops our optimisation reduces the amount of allocations by a few orders of magnitude. We also observe no negative impact on the overall memory footprint nor on the overall runtime. Instead, for some sequential executions we find mild improvement, and on GPU devices we observe speedups of up to a factor of 4x.
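    A minimal sketch of ours showing the pattern the optimisation targets: a shape-preserving update inside a loop, where naively every iteration frees its argument and allocates an equally sized result, and extended memory reuse instead swaps between two live buffers, as one would hand-code in C.

      use Array: all;

      /* Each iteration consumes a and produces a same-sized result: a
         free/alloc pair per iteration without EMR, a pointer swap
         between two retained buffers with EMR.                          */
      double[.] iterate (double[.] a, int steps)
      {
        for (i = 0; i < steps; i++) {
          a = 0.5 * a + 0.25;   /* any shape-preserving array update */
        }
        return a;
      }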

2015

  • Making Fortran Legacy Code More Functional: Using the BGS Geomagnetic Field Modelling System As an Example Hans-Nikolai Viessmann, Sven-Bodo Scholz, Artjoms Sinkarovs, Brian Bainbridge, Brian Hamilton, Simon Flower (2015). In Proceedings of the 27th Symposium on the Implementation and Application of Functional Programming Languages. pp. 11:1–11:13. ACM. New York, NY, USA. DOI BibTexAbstract
    @InProceedings{VSSIFL2015,
     author = {Viessmann, Hans-Nikolai and Scholz, Sven-Bodo and Sinkarovs, Artjoms and Bainbridge, Brian and Hamilton, Brian and Flower, Simon},
     title = {Making Fortran Legacy Code More Functional: Using the BGS Geomagnetic Field Modelling System As an Example},
     booktitle = {Proceedings of the 27th Symposium on the Implementation and Application of Functional Programming Languages},
     series = {IFL '15},
     year = {2015},
     isbn = {978-1-4503-4273-5},
     location = {Koblenz, Germany},
     pages = {11:1--11:13},
     articleno = {11},
     numpages = {13},
     doi = {10.1145/2897336.2897348},
     acmid = {2897348},
     publisher = {ACM},
     address = {New York, NY, USA},
     keywords = {eigensystem, foreign function interface, fortran, functional programming, high-performance computing, single-assignment C},
     summary = {This paper presents the support for calling SaC from Fortran and it reports about the experiences in the context of the BGS Geomagnetic Field Modelling System code.},
     abstract  = {This paper presents an application case study of the British Geological Survey's (BGS) Geomagnetic Field Modelling System code. The program consists of roughly 20 000 lines of highly-tuned Fortran MPI code that has a runtime of about 12 hours for a single execution cycle on a cluster utilising approximately 100 CPU cores. The program contains a sequential bottleneck that executes on a single node of the cluster and takes up to 50% of the overall runtime. We describe an experiment in which we rewrote the bottleneck Fortran code in SaC, to make use of auto-parallelisation provided by the SaC compiler. The paper also presents an implementation of a foreign-function interface, to link the SaC kernel with the Fortran application. Our initial performance measurements compare the SaC kernel performance with the Fortran bottleneck code; we also present results using an OpenMP Fortran implementation. Our figures show that the SaC-based implementation achieves roughly a 12.5% runtime improvement, and outperforms the OpenMP implementation.},
     category = {core,ffi,apps,par},
    }
    This paper presents an application case study of the British Geological Survey's (BGS) Geomagnetic Field Modelling System code. The program consists of roughly 20 000 lines of highly-tuned Fortran MPI code that has a runtime of about 12 hours for a single execution cycle on a cluster utilising approximately 100 CPU cores. The program contains a sequential bottleneck that executes on a single node of the cluster and takes up to 50% of the overall runtime. We describe an experiment in which we rewrote the bottleneck Fortran code in SaC, to make use of auto-parallelisation provided by the SaC compiler. The paper also presents an implementation of a foreign-function interface, to link the SaC kernel with the Fortran application. Our initial performance measurements compare the SaC kernel performance with the Fortran bottleneck code; we also present results using an OpenMP Fortran implementation. Our figures show that the SaC-based implementation achieves roughly a 12.5% runtime improvement, and outperforms the OpenMP implementation.
  • Dynamic Adaptation of Functional Runtime Systems Through External Control Stuart Gordon, Sven-Bodo Scholz (2015). In Proceedings of the 27th Symposium on the Implementation and Application of Functional Programming Languages. pp. 10:1–10:13. ACM. New York, NY, USA. DOI BibTexAbstract
    @InProceedings{GordonScholzIFL15,
     author = {Gordon, Stuart and Scholz, Sven-Bodo},
     title = {Dynamic Adaptation of Functional Runtime Systems Through External Control},
     booktitle = {Proceedings of the 27th Symposium on the Implementation and Application of Functional Programming Languages},
     series = {IFL '15},
     year = {2015},
     isbn = {978-1-4503-4273-5},
     location = {Koblenz, Germany},
     pages = {10:1--10:13},
     articleno = {10},
     numpages = {13},
     doi = {10.1145/2897336.2897347},
     acmid = {2897347},
     publisher = {ACM},
     address = {New York, NY, USA},
     keywords = {SAC, array programming, compilers, dynamic adaptation, functional programming, high performance computing, high-level parallel programming, over-subscription},
     category = {core, opt},
     abstract = {In this paper, we present a novel approach towards providing compiler generated runtime means for dynamic adaptation. The key novelty of the proposed solution is a complete separation between the runtime adaptation mechanism itself and the control mechanism of the adaptation. This strict separation has various benefits including experimentation with adaptation control mechanisms without modifying the runtime system itself, and opening up the possibility for control mechanisms that control several applications possibly even across different runtime systems. The paper does not only present the basic principles of the approach taken, it also describes our prototypical implementation in the context of the functional array programming language SaC. The dynamic adaptation we are concerned with in the context of this paper is an adaptation in the number of computing cores utilised during parallel execution on multiple cores. We show how this adaptation mechanism can be leveraged to deal with changes in the background load generated from other applications running concurrently with the application whose parallelism we want to control.},
     summary = {This paper describes the first cut on providing a runtime interface for continuous external adaptation of SaC programs. It also demonstrates how this can be used to adapt the number of threads used within a with-loop solely based on previous execution times of the same with-loop.},
    }
    In this paper, we present a novel approach towards providing compiler generated runtime means for dynamic adaptation. The key novelty of the proposed solution is a complete separation between the runtime adaptation mechanism itself and the control mechanism of the adaptation. This strict separation has various benefits including experimentation with adaptation control mechanisms without modifying the runtime system itself, and opening up the possibility for control mechanisms that control several applications possibly even across different runtime systems. The paper does not only present the basic principles of the approach taken, it also describes our prototypical implementation in the context of the functional array programming language SaC. The dynamic adaptation we are concerned with in the context of this paper is an adaptation in the number of computing cores utilised during parallel execution on multiple cores. We show how this adaptation mechanism can be leveraged to deal with changes in the background load generated from other applications running concurrently with the application whose parallelism we want to control.

2014

  • SaC/C Formulations of the All-pair N-body Problem and Their Performance on SMPs and GPGPUs A. Šinkarovs, S.B. Scholz, R. Bernecky, R. Douma, C. Grelck (2014). Concurrency and Computation: Practice and Experience 26 (4) pp. 952–971. DOI2014_0.pdf BibTexAbstract
    @Article{SinkSchoBernetalCCPE13,
      author   = {A. Šinkarovs and S.B. Scholz and R. Bernecky and R. Douma and C. Grelck},
      title    = {SaC/C Formulations of the All-pair N-body Problem and Their Performance on SMPs and GPGPUs},
      journal  = {Concurrency and Computation: Practice and Experience},
      year     = {2014},
      volume   = {26},
      number   = {4},
      pages    = {952--971},
      abstract = {This paper describes our experience in implementing the classical N-body algorithm in SaC and analysing the runtime performance achieved on three different machines: a dual-processor 8-core Dell PowerEdge 2950 Beowulf cluster node (the reference machine); a quad-core hyper-threaded Intel Core-i7 based system equipped with an NVidia GTX-480 graphics accelerator; and an Oracle Sparc T4-4 server with a total of 256 hardware threads. We contrast our findings with those resulting from the reference C code and a few variants of it that employ OpenMP pragmas as well as explicit vectorisation. Our experiments demonstrate that the SaC implementation successfully combines a high level of abstraction, very close to the mathematical specification, with very competitive runtimes. In fact, SaC matches or outperforms the hand-vectorised and hand-parallelised C codes on all three systems under investigation without the need for any source code modification. Furthermore, only SaC is able to effectively harness the advanced compute power of the graphics accelerator, again by mere recompilation of the same source code. Our results illustrate the benefits that SaC provides to application programmers in terms of coding productivity, source code, and performance portability among different machine architectures, as well as long-term maintainability in evolving hardware environments. Copyright © 2013 John Wiley & Sons, Ltd.},
      category = {core},
      doi      = {10.1002/cpe.3078},
      url      = {2014_0.pdf},
    }
    This paper describes our experience in implementing the classical N-body algorithm in SaC and analysing the runtime performance achieved on three different machines: a dual-processor 8-core Dell PowerEdge 2950 Beowulf cluster node (the reference machine); a quad-core hyper-threaded Intel Core-i7 based system equipped with an NVidia GTX-480 graphics accelerator; and an Oracle Sparc T4-4 server with a total of 256 hardware threads. We contrast our findings with those resulting from the reference C code and a few variants of it that employ OpenMP pragmas as well as explicit vectorisation. Our experiments demonstrate that the SaC implementation successfully combines a high level of abstraction, very close to the mathematical specification, with very competitive runtimes. In fact, SaC matches or outperforms the hand-vectorised and hand-parallelised C codes on all three systems under investigation without the need for any source code modification. Furthermore, only SaC is able to effectively harness the advanced compute power of the graphics accelerator, again by mere recompilation of the same source code. Our results illustrate the benefits that SaC provides to application programmers in terms of coding productivity, source code, and performance portability among different machine architectures, as well as long-term maintainability in evolving hardware environments. Copyright © 2013 John Wiley & Sons, Ltd.
  • Next Generation Asynchronous Adaptive Specialization for Data-parallel Functional Array Processing in SaC C. Grelck, H. Wiesinger (2014). In Implementation and Application of Functional Languages (IFL'13), 25th International Symposium, Nijmegen, Netherlands. ACM. DOI2014_1.pdf BibTexAbstract
    @InProceedings{GrelWiesIFL13,
      author    = {C. Grelck and H. Wiesinger},
      title     = {Next Generation Asynchronous Adaptive Specialization for Data-parallel Functional Array Processing in {SaC}},
      booktitle = {Implementation and Application of Functional Languages (IFL'13), 25th International Symposium, Nijmegen, Netherlands},
      year      = {2014},
      editor    = {R. Plasmeijer and P. Achten and P. Koopman},
      publisher = {ACM},
      doi       = {10.1145/2620678.2620690},
      url       = {2014_1.pdf},
      abstract  = {Data-parallel processing of multi-dimensional functional/immutable arrays is characterized by a fundamental trade-off between software engineering principles on the one hand and runtime performance concerns on the other hand. Whereas the former demand code to be written in a generic style abstracting from structural properties of arrays as much as possible, the latter require an optimizing compiler to have as much information on the very same structural properties available at compile time. Asynchronous adaptive specialization of generic code to specific data to be processed at application runtime has proven to be an effective way to reconcile these contrarian demands. In this paper we revisit asynchronous adaptive specialization in the context of the functional data-parallel array language SaC. We provide a comprehensive analysis of its strengths and weaknesses and propose improvements for its design and implementation. These improvements are primarily concerned with making specializations available to running applications as quickly as possible. We propose four complementary measures to this effect. Bulk adaptive specialization speculatively waits for future specialization requests to materialize instead of addressing each request individually. Prioritized adaptive specialization aims at selecting the most profitable specializations first. Parallel adaptive specialization reserves multiple cores for specialization and, thus, computes multiple specializations simultaneously. Last but not least, persistent adaptive specialization preserves specializations across independent program runs and even across unrelated applications.},
      category  = {core},
    }
    Data-parallel processing of multi-dimensional functional/immutable arrays is characterized by a fundamental trade-off between software engineering principles on the one hand and runtime performance concerns on the other hand. Whereas the former demand code to be written in a generic style abstracting from structural properties of arrays as much as possible, the latter require an optimizing compiler to have as much information on the very same structural properties available at compile time. Asynchronous adaptive specialization of generic code to specific data to be processed at application runtime has proven to be an effective way to reconcile these contrarian demands. In this paper we revisit asynchronous adaptive specialization in the context of the functional data-parallel array language SaC. We provide a comprehensive analysis of its strengths and weaknesses and propose improvements for its design and implementation. These improvements are primarily concerned with making specializations available to running applications as quickly as possible. We propose four complementary measures to this effect. Bulk adaptive specialization speculatively waits for future specialization requests to materialize instead of addressing each request individually. Prioritized adaptive specialization aims at selecting the most profitable specializations first. Parallel adaptive specialization reserves multiple cores for specialization and, thus, computes multiple specializations simultaneously. Last but not least, persistent adaptive specialization preserves specializations across independent program runs and even across unrelated applications.
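    For illustration, a rank-generic SaC function of ours of the kind the specialisation machinery acts upon: written once for any rank and shape, and specialised at run time, say to double[1000,1000], once that concrete shape is encountered.

      use Array: all;

      /* Fully rank- and shape-generic: a may be a vector, a matrix, or
         any higher-dimensional array. The runtime system can generate
         and link a specialised variant per concrete shape that occurs.  */
      double[*] scale (double[*] a, double s)
      {
        return s * a;
      }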

2013

  • Towards Heterogeneous Computing without Heterogeneous Programming M. Diogo, C. Grelck (2013). In Trends in Functional Programming, 13th Symposium, TFP 2012, St. Andrews, UK. pp. 279–294. Springer. DOI2013_0.pdf BibTexAbstract
    @InProceedings{DiogGrelTFP12,
      author      = {M. Diogo and C. Grelck},
      title       = {Towards Heterogeneous Computing without Heterogeneous Programming},
      booktitle   = {Trends in Functional Programming, 13th Symposium, TFP 2012, St. Andrews, UK},
      year        = {2013},
      editor      = {K. Hammond and H.W. Loidl},
      volume      = {7829},
      series      = {Lecture Notes in Computer Science},
      pages       = {279--294},
      publisher   = {Springer},
      summary     = {},
      abstract    = {From laptops to supercomputer nodes hardware architectures become increasingly heterogeneous, combining at least multiple general-purpose cores with one or even multiple GPU accelerators. Taking effective advantage of such systems’ capabilities becomes increasingly important, but is even more challenging. SaC is a functional array programming language with support for fully automatic parallelization following a data-parallel approach. Typical SaC programs are good matches for both conventional multi-core processors as well as many-core accelerators. Indeed, SaC supports both architectures in an entirely compiler-directed way, but so far a choice must be made at compile time: either the compiled code utilizes multiple cores and ignores a potentially available accelerator, or it uses a single GPU while ignoring all but one core of the host system. We present a compilation scheme and corresponding runtime system support that combine both code generation alternatives to harness the computational forces of multiple general-purpose cores and multiple GPU accelerators to collaboratively execute SaC programs without explicit encoding in the programs themselves and thus without going through the hassle of heterogeneous programming.},
      category    = {core, par},
      DOI         = {10.1007/978-3-642-40447-4_18},
      URL         = {2013_0.pdf},
    }
    From laptops to supercomputer nodes hardware architectures become increasingly heterogeneous, combining at least multiple general-purpose cores with one or even multiple GPU accelerators. Taking effective advantage of such systems’ capabilities becomes increasingly important, but is even more challenging. SaC is a functional array programming language with support for fully automatic parallelization following a data-parallel approach. Typical SaC programs are good matches for both conventional multi-core processors as well as many-core accelerators. Indeed, SaC supports both architectures in an entirely compiler-directed way, but so far a choice must be made at compile time: either the compiled code utilizes multiple cores and ignores a potentially available accelerator, or it uses a single GPU while ignoring all but one core of the host system. We present a compilation scheme and corresponding runtime system support that combine both code generation alternatives to harness the computational forces of multiple general-purpose cores and multiple GPU accelerators to collaboratively execute SaC programs without explicit encoding in the programs themselves and thus without going through the hassle of heterogeneous programming.

2012

  • Single Assignment C (SaC): High Productivity Meets High Performance C. Grelck (2012). In 4th Central European Functional Programming Summer School (CEFP'11), Budapest, Hungary. pp. 207–278. Springer. DOI2012_1.pdf BibTexAbstract
    @InProceedings{GrelckCEFP11,
      author    = {C. Grelck},
      title     = {Single Assignment C ({SaC}): High Productivity Meets High Performance},
      booktitle = {4th Central European Functional Programming Summer School (CEFP'11), Budapest, Hungary},
      year      = {2012},
      editor    = {V. Zs\'ok and Z. Horv\'ath and R. Plasmeijer},
      volume    = {7241},
      series    = {Lecture Notes in Computer Science},
      pages     = {207--278},
      publisher = {Springer},
      abstract  = {We present the ins and outs of the purely functional, data parallel programming language SaC (Single Assignment C). SaC defines state- and side-effect-free semantics on top of a syntax resembling that of imperative languages like C/C++/C# or Java: functional programming with curly brackets. In contrast to other functional languages data aggregation in SaC is not based on lists and trees, but puts stateless arrays into the focus. SaC implements an abstract calculus of truly multidimensional arrays that is adopted from interpreted array languages like APL. Arrays are abstract values with certain structural properties. They are treated in a holistic way, not as loose collections of data cells or indexed memory address ranges. Programs can and should be written in a mostly index-free style. Functions consume array values as arguments and produce array values as results. The array type system of SaC allows such functions to abstract not only from the size of vectors or matrices but likewise from the number of array dimensions, supporting a highly generic programming style. The design of SaC aims at reconciling high productivity in software engineering of compute-intensive applications with high performance in program execution on modern multi- and many-core computing systems. While SaC competes with other functional and declarative languages on the productivity aspect, it competes with hand-parallelised C and Fortran code on the performance aspect. We achieve our goal through stringent co-design of programming language and compilation technology. The focus on arrays in general and the abstract view of arrays in particular combined with a functional state-free semantics are key ingredients in the design of SaC. In conjunction they allow for far-reaching program transformations and fully compiler-directed parallelisation. From literally the same source code SaC currently supports symmetric multi-socket, multi-core, hyperthreaded server systems, CUDA-enabled graphics accelerators and the MicroGrid, an innovative general-purpose many-core architecture. The CEFP lecture provides an introduction into the language design of SaC, followed by an illustration of how these concepts can be harnessed to write highly abstract, reusable and elegant code. We conclude with outlining the major compiler technologies for achieving runtime performance levels that are competitive with low-level machine-oriented programming environments.},
      category  = {design,core},
      doi       = {10.1007/978-3-642-32096-5_5},
      topics    = {SAC},
      url       = {2012_1.pdf},
    }
    We present the ins and outs of the purely functional, data parallel programming language SaC (Single Assignment C). SaC defines state- and side-effect-free semantics on top of a syntax resembling that of imperative languages like C/C++/C# or Java: functional programming with curly brackets. In contrast to other functional languages data aggregation in SaC is not based on lists and trees, but puts stateless arrays into the focus. SaC implements an abstract calculus of truly multidimensional arrays that is adopted from interpreted array languages like APL. Arrays are abstract values with certain structural properties. They are treated in a holistic way, not as loose collections of data cells or indexed memory address ranges. Programs can and should be written in a mostly index-free style. Functions consume array values as arguments and produce array values as results. The array type system of SaC allows such functions to abstract not only from the size of vectors or matrices but likewise from the number of array dimensions, supporting a highly generic programming style. The design of SaC aims at reconciling high productivity in software engineering of compute-intensive applications with high performance in program execution on modern multi- and many-core computing systems. While SaC competes with other functional and declarative languages on the productivity aspect, it competes with hand-parallelised C and Fortran code on the performance aspect. We achieve our goal through stringent co-design of programming language and compilation technology. The focus on arrays in general and the abstract view of arrays in particular combined with a functional state-free semantics are key ingredients in the design of SaC. In conjunction they allow for far-reaching program transformations and fully compiler-directed parallelisation. From literally the same source code SaC currently supports symmetric multi-socket, multi-core, hyperthreaded server systems, CUDA-enabled graphics accelerators and the MicroGrid, an innovative general-purpose many-core architecture. The CEFP lecture provides an introduction into the language design of SaC, followed by an illustration of how these concepts can be harnessed to write highly abstract, reusable and elegant code. We conclude with outlining the major compiler technologies for achieving runtime performance levels that are competitive with low-level machine-oriented programming environments.
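    As a taste of the programming style the lecture notes describe, a minimal example of ours using the with-loop, SaC's central data-parallel construct:

      use Array: all;

      /* A genarray with-loop computing a 10x10 identity matrix: the
         generator ranges over all indices; any index not covered by the
         generator (none here) would take the default element 0.0.       */
      double[.,.] eye10 ()
      {
        return with {
                 ([0,0] <= [i,j] < [10,10]) : (i == j) ? 1.0 : 0.0;
               } : genarray ([10,10], 0.0);
      }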
  • User-defined Shape Constraints in SaC F. Tang, C. Grelck (2012). In 24th International Symposium on Implementation and Application of Functional Languages (IFL'12), Oxford, UK. University of Oxford. 2012_0.pdf BibTexAbstract
    @InProceedings{TangGrelIFL12,
      author    = {F. Tang and C. Grelck},
      title     = {User-defined Shape Constraints in {SaC}},
      booktitle = {24th International Symposium on Implementation and Application of Functional Languages (IFL'12), Oxford, UK},
      year      = {2012},
      editor    = {R. Hinze},
      publisher = {University of Oxford},
      abstract  = {We propose a method called user-defined constraints specifically for shape-generic multi-dimensional array programming. Our proposed technique allows programmers to make implicit constraints in the domain and codomain of functions explicit. This method can help compilers to generate more reliable code, improve performance through better optimization and improve software documentation. We propose and motivate a syntax extension for the functional array language SAC and describe steps to systematically transform source-level constraints into existing intermediate code representations. We discuss ways of statically resolving constraints through aggressive partial evaluation and propose some form of syntactic sugar that blurs the line between user-defined constraints and fully-fledged dependent types.},
      category  = {core},
      url       = {2012_0.pdf},
    }
    We propose a method called user-defined constraints specifically for shape-generic multi-dimensional array programming. Our proposed technique allows programmers to make implicit constraints in the domain and codomain of functions explicit. This method can help compilers to generate more reliable code, improve performance through better optimization and improve software documentation. We propose and motivate a syntax extension for the functional array language SAC and describe steps to systematically transform source-level constraints into existing intermediate code representations. We discuss ways of statically resolving constraints through aggressive partial evaluation and propose some form of syntactic sugar that blurs the line between user-defined constraints and fully-fledged dependent types.
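    To make this concrete: in the SaC sketch of ours below, the relationship between the two argument shapes is exactly the kind of implicit domain constraint the proposal would let the programmer state explicitly in the function signature (we only hint at the constraint in a comment; for the proposed syntax itself, see the paper).

      use Array: all;

      /* Implicit constraint on the domain: shape(a)[1] == shape(b)[0].
         The proposal makes such constraints explicit at source level, so
         the compiler can check, exploit, and document them.             */
      double[.,.] matmul (double[.,.] a, double[.,.] b)
      {
        return { [i,j] -> sum ({ [k] -> a[i,k] * b[k,j] }) };
      }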
  • SaC on a Niagara T3-4 Server: Lessons and Experiences C. Grelck, R. Douma (2012). In Applications, Tools and Techniques on the Road to Exascale Computing. pp. 289–296. IOS Press, Amsterdam. DOI BibTexAbstract
    @InCollection{GrelDoumParCo11,
      author    = {C. Grelck and R. Douma},
      title     = {SaC on a Niagara T3-4 Server: Lessons and Experiences},
      booktitle = {Applications, Tools and Techniques on the Road to Exascale Computing},
      publisher = {IOS Press, Amsterdam},
      year      = {2012},
      editor    = {K. {de Bosschere} and E.H. {D'Hollander} and G.R. Joubert and D. Padua and F. Peters and M. Sawyer},
      volume    = {22},
      series    = {Advances in Parallel Computing},
      pages     = {289--296},
      abstract  = {The Sparc T3-4 server provides up to 512 concurrent hardware threads, a degree of concurrency that is unprecedented in a single server system. This paper reports on how the automatically parallelising compiler of the data-parallel functional array language SAC copes with up to 512 execution units. We investigate three different numerical kernels that are representative for a wide range of applications: matrix multiplication, convolution and 3-dimensional FFT. We show both the high-level declarative coding style of SAC and the performance achieved on the T3-4 server. Last but not least, we draw conclusions for improving our compiler technology in the future.},
      category  = {core},
      doi       = {10.3233/978-1-61499-041-3-289},
      topics    = {SAC},
    }
    The Sparc T3-4 server provides up to 512 concurrent hardware threads, a degree of concurrency that is unprecedented in a single server system. This paper reports on how the automatically parallelising compiler of the data-parallel functional array language SAC copes with up to 512 execution units. We investigate three different numerical kernels that are representative for a wide range of applications: matrix multiplication, convolution and 3-dimensional FFT. We show both the high-level declarative coding style of SAC and the performance achieved on the T3-4 server. Last but not least, we draw conclusions for improving our compiler technology in the future.

2011

  • Nested Arrays in Single Assignment C R. Douma (2011). Amsterdam, Netherlands. 2011_0.pdf BibTexAbstract
    @MastersThesis{Douma11,
      author    = {R. Douma},
      title     = {Nested Arrays in Single Assignment C},
      school    = {University of Amsterdam},
      year      = {2011},
      address   = {Amsterdam, Netherlands},
      topics    = {SAC},
      summary   = {},
      abstract  = {In many languages when one talks about arrays this means rectangular arrays. However, there are many problems that are not rectangular and it is often illogical to describe them using rectangular arrays. We will call these non-rectangular arrays irregular arrays. In this thesis we introduce an implementation for irregular arrays, called nested arrays, in Single Assignment C (or SaC for short). We discuss the design space of irregular arrays and extend the SaC language and the SaC compiler to support irregular arrays. As a result we show that for SaC programs that have irregular data the use of nested arrays can lead to a significant reduction in memory requirements, while at the same time providing substantial speedup.},
      category  = {core, design},
      DOI       = {},
      URL       = {2011_0.pdf},
    }
    In many languages when one talks about arrays this means rectangular arrays. However, there are many problems that are not rectangular and it is often illogical to describe them using rectangular arrays. We will call these non-rectangular arrays irregular arrays. In this thesis we introduce an implementation for irregular arrays, called nested arrays, in Single Assignment C (or SaC for short). We discuss the design space of irregular arrays and extend the SaC language and the SaC compiler to support irregular arrays. As a result we show that for SaC programs that have irregular data the use of nested arrays can lead to a significant reduction in memory requirements, while at the same time providing substantial speedup.
  • Breaking the GPU Programming Barrier with the Auto-parallelising SaC Compiler Jing Guo, Jeyarajan Thiyagalingam, Sven-Bodo Scholz (2011). In 6th Workshop on Declarative Aspects of Multicore Programming (DAMP'11), Austin, USA. pp. 15–24. ACM Press. DOI2011_5.pdf BibTexAbstract
    @InProceedings{GuoJeyaSchoDAMP11,
      author     = {Jing Guo and Jeyarajan Thiyagalingam and Sven-Bodo Scholz},
      title      = {Breaking the {GPU} Programming Barrier with the Auto-parallelising {SaC} Compiler},
      booktitle  = {6th Workshop on Declarative Aspects of Multicore Programming (DAMP'11), Austin, USA},
      year       = {2011},
      pages      = {15--24},
      publisher  = {ACM Press},
      abstract   = {Over recent years, the use of Graphics Processing Units (GPUs) for general-purpose computing has become increasingly popular. The main reasons for this development are the attractive performance/price and performance/power ratios of these architectures. However, substantial performance gains from GPUs come at a price: they require extensive programming expertise and, typically, a substantial re-coding effort. Although the programming experience has been significantly improved by existing frameworks like CUDA and OpenCL, it is still a challenge to effectively utilise these devices. Directive-based approaches such as hiCUDA or OpenMP-variants offer further improvements but have not eliminated the need for the expertise on these complex architectures. Similarly, special purpose programming languages such as Microsoft's Accelerator try to lower the barrier further. They provide the programmer with a special form of GPU data structures and operations on them which are then compiled into GPU code. In this paper, we take this trend towards a completely implicit, high-level approach yet another step further. We generate CUDA code from a MATLAB-like high level functional array programming language, Single Assignment C (SaC). To do so, we identify which data structures and operations can be successfully mapped on GPUs and transform existing programs accordingly. This paper presents the first runtime results from our GPU backend and it presents the basic set of GPU-specific program optimisations that turned out to be essential. Despite our high-level program specifications, we show that for a number of benchmarks speedups between a factor of 5 and 50 can be achieved through our parallelising compiler.},
      affil      = {ctca},
      category   = {par,core,opt},
      doi        = {10.1145/1926354.1926359},
      pubaddress = {New York, NY, USA},
      url        = {2011_5.pdf},
    }
    Over recent years, the use of Graphics Processing Units (GPUs) for general-purpose computing has become increasingly popular. The main reasons for this development are the attractive performance/price and performance/power ratios of these architectures. However, substantial performance gains from GPUs come at a price: they require extensive programming expertise and, typically, a substantial re-coding effort. Although the programming experience has been significantly improved by existing frameworks like CUDA and OpenCL, it is still a challenge to effectively utilise these devices. Directive-based approaches such as hiCUDA or OpenMP-variants offer further improvements but have not eliminated the need for the expertise on these complex architectures. Similarly, special purpose programming languages such as Microsoft's Accelerator try to lower the barrier further. They provide the programmer with a special form of GPU data structures and operations on them which are then compiled into GPU code. In this paper, we take this trend towards a completely implicit, high-level approach yet another step further. We generate CUDA code from a MATLAB-like high level functional array programming language, Single Assignment C (SaC). To do so, we identify which data structures and operations can be successfully mapped on GPUs and transform existing programs accordingly. This paper presents the first runtime results from our GPU backend and it presents the basic set of GPU-specific program optimisations that turned out to be essential. Despite our high-level program specifications, we show that for a number of benchmarks speedups between a factor of 5 and 50 can be achieved through our parallelising compiler.
  • Using OpenMP As an Alternative Parallelization Strategy in SaC Zheng Zhangzheng (2011). Amsterdam, Netherlands. 2011_6.pdf BibTex
    @MastersThesis{Zhangzheng11,
      author  = {Zheng Zhangzheng},
      title   = {Using {OpenMP} As an Alternative Parallelization Strategy in {SaC}},
      school  = {University of Amsterdam},
      year    = {2011},
      category = {par, core},
      address = {Amsterdam, Netherlands},
      url     = {2011_6.pdf},
    }

2010

  • Compilation of Modelica Array Computations into Single Assignment C for Efficient Execution on CUDA-enabled GPU Kristian Stavåker, Daniel Rolls, Jing Guo, Peter Fritzson, Sven-Bodo Scholz (October 2010). In 3rd International Workshop on Equation-Based Object-Oriented Languages and Tools, Oslo, Norway. 2010_2.pdf BibTexAbstract
    @InProceedings{RollSchoJoslStavetalEOOLT10,
      author    = {Kristian Stavåker and Daniel Rolls and Jing Guo and Peter Fritzson and Sven-Bodo Scholz},
      title     = {Compilation of Modelica Array Computations into Single Assignment C for Efficient Execution on {CUDA}-enabled {GPU}},
      booktitle = {3rd International Workshop on Equation-Based Object-Oriented Languages and Tools, Oslo, Norway},
      year      = {2010},
      month     = oct,
      abstract  = {Mathematical models, derived for example from discretisation of partial differential equations, often contain operations over large arrays. In this work we investigate the possibility of compiling array operations from models in the equation-based language Modelica into Single Assignment C (SAC). The SAC compiler SAC2C can generate highly efficient code that, for instance, can be executed on CUDA-enabled GPUs. We plan to enhance the open-source Modelica compiler OpenModelica with capabilities to detect and compile data parallel Modelica for-equations/array-equations into SAC WITH-loops. As a first step we demonstrate the feasibility of this approach by manually inserting calls to SAC array operations in the code generated from OpenModelica and show how capabilities and runtimes can be extended. As a second step we demonstrate the feasibility of rewriting parts of the OpenModelica simulation runtime system in SAC. Finally, we discuss SAC2C's switchable target architectures and demonstrate one by harnessing a CUDA-enabled GPU to improve runtimes. To the best of our knowledge, compilation of Modelica array operations for execution on CUDA-enabled GPUs is a new research area.},
      affil     = {ctca},
      category  = {core,opt},
      topics    = {SAC, cuda, Modelica, Openmodelica},
      url       = {2010_2.pdf},
    }
    Mathematical models, derived for example from discretisation of partial differential equations, often contain operations over large arrays. In this work we investigate the possibility of compiling array operations from models in the equation-based language Modelica into Single Assignment C (SAC). The SAC compiler SAC2C can generate highly efficient code that, for instance, can be executed on CUDA-enabled GPUs. We plan to enhance the open-source Modelica compiler OpenModelica with capabilities to detect and compile data parallel Modelica for-equations/array-equations into SAC WITH-loops. As a first step we demonstrate the feasibility of this approach by manually inserting calls to SAC array operations in the code generated from OpenModelica and show how capabilities and runtimes can be extended. As a second step we demonstrate the feasibility of rewriting parts of the OpenModelica simulation runtime system in SAC. Finally, we discuss SAC2C's switchable target architectures and demonstrate one by harnessing a CUDA-enabled GPU to improve runtimes. To the best of our knowledge, compilation of Modelica array operations for execution on CUDA-enabled GPUs is a new research area.
  • Compiler-support for Robust Multi-core Computing Raimund Kirner, Stephan Herhut, Sven-Bodo Scholz (2010). In 4th International Symposium on Leveraging Applications of Formal Methods, Verification and Validation. Springer Verlag. DOI2010_1.pdf BibTexAbstract
    @InProceedings{KirnHerhScho10,
      author     = {Raimund Kirner and Stephan Herhut and Sven-Bodo Scholz},
      title      = {Compiler-support for Robust Multi-core Computing},
      booktitle  = {4th International Symposium on Leveraging Applications of Formal Methods, Verification and Validation},
      year       = {2010},
      publisher  = {Springer Verlag},
      abstract   = {Embedded computing is characterised by the limited availability of computing resources. Further, embedded systems are often used in safety-critical applications with real-time constraints. Thus, the software development has to follow rigorous procedures to minimise the risk of system failures. However, besides the inherent application complexities, there is also an increased technology-based complexity due to the shift to concurrent programming of multi-core systems. For such systems it is quite challenging to develop safe and resource-efficient systems. In this paper we give a plea for the need of better software development tools to cope with this challenge. For example, we outline how compilers can help to simplify the writing of fault-tolerant and robust software, which keeps the application code more compact, comprehensive, and maintainable. We take a rather extreme stand by promoting a functional programming approach. This functional programming paradigm reduces the complexity of program analysis and thus allows for more efficient and powerful techniques. We will implement an almost transparent support for robustness within the SaC research compiler, which accepts a C-like functional program as input. Compared to conventional approaches in the field of automatic software-controlled resilience, our functional setting will allow for lower overhead, making the approach interesting for embedded computing as well as for high-performance computing.},
      affil      = {ctca},
      category   = {core,par},
      doi        = {10.1007/978-3-642-16558-0_6},
      location   = {Amirandes, Crete},
      pubaddress = {Berlin, Heidelberg, Germany},
      topics     = {SAC, robustness, compiler-based robustness, fault-tolerance},
      url        = {2010_1.pdf},
    }
    Embedded computing is characterised by the limited availability of computing resources. Further, embedded systems are often used in safety-critical applications with real-time constraints. Thus, the software development has to follow rigorous procedures to minimise the risk of system failures. However, besides the inherent application complexities, there is also an increased technology-based complexity due to the shift to concurrent programming of multi-core systems. For such systems it is quite challenging to develop safe and resource-efficient systems. In this paper we give a plea for the need of better software development tools to cope with this challenge. For example, we outline how compilers can help to simplify the writing of fault-tolerant and robust software, which keeps the application code more compact, comprehensive, and maintainable. We take a rather extreme stand by promoting a functional programming approach. This functional programming paradigm reduces the complexity of program analysis and thus allows for more efficient and powerful techniques. We will implement an almost transparent support for robustness within the SaC research compiler, which accepts a C-like functional program as input. Compared to conventional approaches in the field of automatic software-controlled resilience, our functional setting will allow for lower overhead, making the approach interesting for embedded computing as well as for high-performance computing.
  • An Adaptive Compilation Framework for Generic Data-parallel Array Programming Clemens Grelck, Tim van Deurzen, Stephan Herhut, Sven-Bodo Scholz (2010). In 15th Workshop on Compilers for Parallel Computing (CPC'10). Vienna University of Technology, Vienna, Austria. 2010_3.pdf BibTexAbstract
    @InProceedings{GrelDeurHerhetalCPC10,
      author    = {Clemens Grelck and Tim van Deurzen and Stephan Herhut and Sven-Bodo Scholz},
      title     = {An Adaptive Compilation Framework for Generic Data-parallel Array Programming},
      booktitle = {15th Workshop on Compilers for Parallel Computing (CPC'10)},
      year      = {2010},
      publisher = {Vienna University of Technology, Vienna, Austria},
      abstract  = {Generic array programming abstracts from structural properties of arrays, such as rank (number of axes/dimensions) and shape (number of elements along each axis/dimension). This allows for abstract program specifications and, as such, is desirable from a software engineering perspective. However, generic programming in this sense does have an adverse effect on runtime performance, at least when executed naively. Static compiler analyses and transformations aim at reconciling software engineering desires for generic code with runtime performance requirements. However, they are bound to fail whenever the required information is not available until runtime. We propose a compilation framework that overcomes the inherent limitations of static analysis by incrementally adapting a running program to the structural properties of the arrays it operates on. This is achieved by partial recompilation of code at runtime, when all structural properties of arrays are known, and dynamic relinking of the running program with dynamically generated code. We sketch out the general compilation framework architecture and provide some details on implementation issues.},
      affil     = {ctca},
      category  = {par,core},
      topics    = {SAC},
      url       = {2010_3.pdf},
    }
    Generic array programming abstracts from structural properties of arrays, such as rank (number of axes/dimensions) and shape (number of elements along each axis/dimension). This allows for abstract program specifications and, as such, is desirable from a software engineering perspective. However, generic programming in this sense does have an adverse effect on runtime performance, at least when executed naively. Static compiler analyses and transformations aim at reconciling software engineering desires for generic code with runtime performance requirements. However, they are bound to fail whenever the required information is not available until runtime. We propose a compilation framework that overcomes the inherent limitations of static analysis by incrementally adapting a running program to the structural properties of the arrays it operates on. This is achieved by partial recompilation of code at runtime, when all structural properties of arrays are known, and dynamic relinking of the running program with dynamically generated code. We sketch out the general compilation framework architecture and provide some details on implementation issues.
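    What the framework adapts between can be sketched in a few lines. The definition below is a hypothetical example (not from the paper): a rank-generic function that would initially be compiled generically and later be respecialised at runtime, once the concrete argument shape has been observed.

      /* Rank- and shape-generic: applicable to arrays of any rank. */
      double[*] scale (double[*] a, double s)
      {
        return with {
                 ( 0 * shape(a) <= iv < shape(a) ) : s * a[iv];
               } : genarray( shape(a), 0.0 );
      }
      /* Once the running program observes, say, a 1024 x 1024 argument,
         the framework recompiles a shape-specific instance
           double[1024,1024] scale (double[1024,1024] a, double s)
         and dynamically relinks the call site to it. */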
  • Thread-local Stacks, a Light-weight Alternative to Thread-local Heaps Stephan Herhut, Carl Joslin, Sven-Bodo Scholz (2010). In 15th Workshop on Compilers for Parallel Computing (CPC'10). Vienna University of Technology, Vienna, Austria. BibTex
    @InProceedings{HerhJoslSchoCPC10,
      author    = {Stephan Herhut and Carl Joslin and Sven-Bodo Scholz},
      title     = {Thread-local Stacks, a Light-weight Alternative to Thread-local Heaps},
      booktitle = {15th Workshop on Compilers for Parallel Computing (CPC'10)},
      year      = {2010},
      publisher = {Vienna University of Technology, Vienna, Austria},
      affil     = {ctca},
      category  = {par,core},
      topics    = {SAC},
    }
  • Single Assignment C Tutorial. Ppopp 2010, Bangalore, India Sven-Bodo Scholz, Stephan Herhut, Clemens Grelck, Frank Penczek (2010). School of Computer Science, University of Hertfordshire. BibTex
    @TechReport{SchoHerhGreletal10,
      author      = {Scholz, Sven-Bodo and Herhut, Stephan and Grelck, Clemens and Penczek, Frank},
      title       = {Single Assignment C Tutorial. Ppopp 2010, Bangalore, India},
      institution = {School of Computer Science, University of Hertfordshire},
      year        = {2010},
      number      = {498},
      affil       = {ctca},
      summary     = {},
      abstract    = {},
      category    = {core},
      DOI         = {},
      URL         = {},
    }

2009

  • Truly Nested Data-parallelism: Compiling Sac to the Microgrid Architecture Stephan Herhut, Carl Joslin, Sven-Bodo Scholz, Clemens Grelck (2009). In 21st Symposium on Implementation and Application of Functional Languages (IFL'09), South Orange, NJ, USA. Seton Hall University. 2009_0.pdf BibTexAbstract
    @InProceedings{HerhJoslSchoetalIFL09draft,
      author     = {Stephan Herhut and Carl Joslin and Sven-Bodo Scholz and Clemens Grelck},
      title      = {Truly Nested Data-parallelism: Compiling Sac to the Microgrid Architecture},
      booktitle  = {21st Symposium on Implementation and Application of Functional Languages (IFL'09), South Orange, NJ, USA},
      year       = {2009},
      number     = {SHU-TR-CS-2009-09-1},
      publisher  = {Seton Hall University},
      abstract   = {Data-parallel programming facilitates elegant specification of concurrency. However, the composability of data-parallel operations so far has been constrained by the requirement to have only flat data-parallel operations at runtime. In this paper, we present early results on our work to exploit hardware support for nested concurrency to directly map nested data-parallel operations in high-level specifications to low-level codes that can be efficiently executed. To this effect, we have devised a compilation scheme from data-parallel operations in SaC to the systems programming language of the Microgrid architecture. Furthermore, we present early empirical results to assert the viability of our approach.},
      affil      = {ctca},
      category   = {par,core},
      pubaddress = {South Orange, NJ, USA},
      url        = {2009_0.pdf},
    }
    Data-parallel programming facilitates elegant specification of concurrency. However, the composability of data-parallel operations so far has been constrained by the requirement to have only flat data-parallel operations at runtime. In this paper, we present early results on our work to exploit hardware support for nested concurrency to directly map nested data-parallel operations in high-level specifications to low-level codes that can be efficiently executed. To this effect, we have devised a compilation scheme from data-parallel operations in SaC to the systems programming language of the Microgrid architecture. Furthermore, we present early empirical results to assert the viability of our approach.
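    Nesting in this sense is directly expressible in SaC source code. A minimal sketch (hypothetical example, not taken from the paper) is a with-loop over rows whose body is itself a fold with-loop:

      /* Outer with-loop: one data-parallel instance per row.
         Inner with-loop: a data-parallel reduction within each row. */
      double[.] row_sums (double[.,.] m)
      {
        return with {
                 ( [0] <= [i] < [shape(m)[0]] ) :
                   with {
                     ( [0] <= [j] < [shape(m)[1]] ) : m[[i,j]];
                   } : fold( +, 0.0 );
               } : genarray( [shape(m)[0]], 0.0 );
      }

    On conventional backends the inner level is effectively serialised; the Microgrid's hardware-level thread creation is what makes mapping both levels to concurrency plausible.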
  • Unibench: The Swiss Army Knife for Collaborative, Automated Benchmarking Daniel Rolls, Stephan Herhut, Carl Joslin, Sven-Bodo Scholz (2009). In 21st Symposium on Implementation and Application of Functional Languages (IFL'09), South Orange, NJ, USA. Seton Hall University, South Orange, NJ, USA. 2009_2.pdf BibTex
    @TechReport{RollHerhJosletalIFL09draft,
      author    = {Daniel Rolls and Stephan Herhut and Carl Joslin and Sven-Bodo Scholz},
      title     = {Unibench: The Swiss Army Knife for Collaborative, Automated Benchmarking},
      year      = {2009},
      number    = {{SHU-TR-CS-2009-09-1}},
      affil     = {ctca},
      booktitle = {21st Symposium on Implementation and Application of Functional Languages (IFL'09), South Orange, NJ, USA},
      publisher = {Seton Hall University, South Orange, NJ, USA.},
      summary   = {},
      abstract  = {},
      category  = {core},
      DOI       = {},
      URL       = {2009_2.pdf},
    }
  • Compiling the Functional Data-parallel Language Sac for Microgrids of Self-adaptive Virtual Processors Clemens Grelck, Stephan Herhut, Chris Jesshope, Carl Joslin, Mike Lankamp, Sven-Bodo Scholz, Alex Shafarenko (2009). In 14th Workshop on Compilers for Parallel Computing (CPC'09), IBM Research Center, Zürich, Switzerland. 2009_4.pdf BibTexAbstract
    @InProceedings{GrelJessJosletalCPC09,
      author    = {Clemens Grelck and Stephan Herhut and Chris Jesshope and Carl Joslin and Mike Lankamp and Sven-Bodo Scholz and Alex Shafarenko},
      title     = {Compiling the Functional Data-parallel Language Sac for Microgrids of Self-adaptive Virtual Processors},
      booktitle = {14th Workshop on Compilers for Parallel Computing (CPC'09), IBM Research Center, Z\"urich, Switzerland},
      year      = {2009},
      abstract  = {We present preliminary results from compiling the high-level, functional and data-parallel programming language SaC into a novel multi-core design: Microgrids of Self-Adaptive Virtual Processors (SVPs). The side-effect free nature of SaC in conjunction with its data-parallel foundation make it an ideal candidate for auto-parallelisation. Notwithstanding these favourable pre-conditions, scheduling and data-placement pose major challenges for effective parallelisation of irregular applications because fine-grained dynamic approaches inflict large overheads on conventional architectures. The Microgrid architecture promises a radical shift: thread creation and context switches are implemented in hardware and cause negligible overhead. Likewise, scheduling of tasks to computing resources is catered for by hardware. This paper investigates aspects of the Microgrid architecture which may influence the overall performance of compiled data-parallel programs. In particular, we look at alternative thread creation schemes for n-dimensional, data-parallel operations and their effect on overall performance. Furthermore, we investigate the architecture's capability to cope with workload imbalances within such operations. The paper presents a sequence of experiments on a cycle-accurate emulator of the Microgrid architecture from which we derive some guiding principles for an effective compilation of data-parallel operations. Based on these principles, we present a compilation scheme for the data-parallel core of SaC.},
      affil     = {ctca},
      category  = {par,design,core},
      topics    = {SAC},
      url       = {2009_4.pdf},
    }
    We present preliminary results from compiling the high-level, functional and data-parallel programming language SaC into a novel multi-core design: Microgrids of Self-Adaptive Virtual Processors (SVPs). The side-effect free nature of SaC in conjunction with its data-parallel foundation make it an ideal candidate for auto-parallelisation. Notwithstanding these favourable pre-conditions, scheduling and data-placement pose major challenges for effective parallelisation of irregular applications because fine-grained dynamic approaches inflict large overheads on conventional architectures. The Microgrid architecture promises a radical shift: thread creation and context switches are implemented in hardware and cause negligible overhead. Likewise, scheduling of tasks to computing resources is catered for by hardware. This paper investigates aspects of the Microgrid architecture which may influence the overall performance of compiled data-parallel programs. In particular, we look at alternative thread creation schemes for n-dimensional, data-parallel operations and their effect on overall performance. Furthermore, we investigate the architecture's capability to cope with workload imbalances within such operations. The paper presents a sequence of experiments on a cycle-accurate emulator of the Microgrid architecture from which we derive some guiding principles for an effective compilation of data-parallel operations. Based on these principles, we present a compilation scheme for the data-parallel core of SaC.
  • Towards Compiling Sac to Cuda Jing Guo, Jeyarajan Thiyagalingam, Sven-Bodo Scholz (2009). In 10th Symposium on Trends in Functional Programming (TFP'09). pp. 33–49. Intellect. BibTex
    @InProceedings{GuoThiySchoTFP09,
      author    = {Jing Guo and Jeyarajan Thiyagalingam and Sven-Bodo Scholz},
      title     = {Towards Compiling Sac to Cuda},
      booktitle = {10th Symposium on Trends in Functional Programming (TFP'09)},
      year      = {2009},
      editor    = {Zoltan Horváth and Viktória Zsók},
      pages     = {33--49},
      publisher = {Intellect},
      affil     = {ctca},
      category  = {core},
      topics    = {SAC,CUDA},
    }

2008

  • Efficient Heap Management for Declarative Data Parallel Programming on Multicores Clemens Grelck, Sven-Bodo Scholz (2008). In 3rd Workshop on Declarative Aspects of Multicore Programming (DAMP'08), San Francisco, CA, USA. pp. 17–31. ACM Press. 2008_1.pdf BibTexAbstract
    @InProceedings{GrelSchoDAMP08,
      author     = {Clemens Grelck and Sven-Bodo Scholz},
      title      = {Efficient Heap Management for Declarative Data Parallel Programming on Multicores},
      booktitle  = {3rd Workshop on Declarative Aspects of Multicore Programming (DAMP'08), San Francisco, CA, USA},
      year       = {2008},
      editor     = {Manuel Hermenegildo and Leaf Peterson and Neal Glew},
      pages      = {17--31},
      publisher  = {ACM Press},
      affil      = {ctca},
      pubaddress = {New York, NY, USA},
      topics     = {SAC},
      summary    = {},
      abstract   = {Declarative data parallel programming for shared memory multiprocessor systems implies paradigm-specific demands on the organisation of memory management. As a key feature of declarative programming implicit memory management is indispensable. Yet, the memory objects to be managed are very different from those that are predominant in general-purpose functional or object-oriented languages. Rather than complex structures of relatively small items interconnected by references, we are faced with large chunks of memory, usually arrays, which often account for 100s of MB each. Such sizes make relocation of data or failure to update arrays in-place prohibitively expensive. To address these challenges of the data parallel setting, the functional array language SaC employs continuous garbage collection via reference counting combined with several aggressive optimisation techniques. However, we have observed that overall memory performance does not only rely on efficient reference counting techniques, but to a similar extent on the underlying memory allocation strategy. As in the general memory management setting we can identify specific demands of the declarative data parallel setting on heap organisation. In this paper, we propose a heap manager design tailor-made to the needs of concurrent executions of declarative data parallel programs whose memory management is based on reference counting. We present runtime measurements that quantify the impact of the proposed design and relate it to the performance of several different general-purpose heap managers that are available in the public domain.},
      category   = {par, core},
      DOI        = {},
      URL        = {2008_1.pdf},
    }
    Declarative data parallel programming for shared memory multiprocessor systems implies paradigm-specific demands on the organisation of memory management. As a key feature of declarative programming implicit memory management is indispensable. Yet, the memory objects to be managed are very different from those that are predominant in general-purpose functional or object-oriented languages. Rather than complex structures of relatively small items interconnected by references, we are faced with large chunks of memory, usually arrays, which often account for 100s of MB each. Such sizes make relocation of data or failure to update arrays in-place prohibitively expensive. To address these challenges of the data parallel setting, the functional array language SaC employs continuous garbage collection via reference counting combined with several aggressive optimisation techniques. However, we have observed that overall memory performance does not only rely on efficient reference counting techniques, but to a similar extent on the underlying memory allocation strategy. As in the general memory management setting we can identify specific demands of the declarative data parallel setting on heap organisation. In this paper, we propose a heap manager design tailor-made to the needs of concurrent executions of declarative data parallel programs whose memory management is based on reference counting. We present runtime measurements that quantify the impact of the proposed design and relate it to the performance of several different general-purpose heap managers that are available in the public domain.
  • A Hybrid Shared Memory Execution Model for a Data Parallel Language with I/o Clemens Grelck, Steffen Kuthe, Sven-Bodo Scholz (2008). Parallel Processing Letters 18 (1) pp. 23–37. BibTexAbstract
    @Article{GrelKuthSchoPPL08,
      author   = {Clemens Grelck and Steffen Kuthe and Sven-Bodo Scholz},
      title    = {A Hybrid Shared Memory Execution Model for a Data Parallel Language with I/o},
      journal  = {Parallel Processing Letters},
      year     = {2008},
      volume   = {18},
      number   = {1},
      pages    = {23--37},
      abstract = {We propose a novel execution model for the implicitly parallel execution of data parallel programs in the presence of general I/O operations. This model is called hybrid because it combines the advantages of the standard execution models fork/join and SPMD. Based on program analysis the hybrid model adapts itself to one or the other on the granularity of individual instructions. We outline compilation techniques that systematically derive the organization of parallel code from data flow characteristics aiming at the reduction of execution mode switches in general and synchronization/communication requirements in particular. Experiments based on a prototype implementation show the effectiveness of the hybrid execution model for reducing parallel overhead.},
      affil    = {ctca},
      category = {par,core},
      topics   = {SAC},
    }
    We propose a novel execution model for the implicitly parallel execution of data parallel programs in the presence of general I/O operations. This model is called hybrid because it combines the advantages of the standard execution models fork/join and SPMD. Based on program analysis the hybrid model adapts itself to one or the other on the granularity of individual instructions. We outline compilation techniques that systematically derive the organization of parallel code from data flow characteristics aiming at the reduction of execution mode switches in general and synchronization/communication requirements in particular. Experiments based on a prototype implementation show the effectiveness of the hybrid execution model for reducing parallel overhead.
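    The mode switches the model seeks to minimise show up in even the smallest programs. Below is a hypothetical sketch (not from the paper): two data-parallel with-loops bracket an I/O operation. Under fork/join each with-loop pays a fork and a join; the hybrid model keeps worker threads in SPMD mode across adjacent data-parallel instructions and serialises execution only for the print.

      use StdIO: all;
      use Array: all;

      int main ()
      {
        /* data-parallel: SPMD-friendly */
        a = with {
              ( [0] <= iv < [1000000] ) : 1.0;
            } : genarray( [1000000], 0.0 );
        b = with {
              ( [0] <= iv < shape(a) ) : 2.0 * a[iv];
            } : genarray( shape(a), 0.0 );

        /* general I/O: must execute single-threaded */
        print( sum( b ) );

        return 0;
      }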

2007

  • On Optimising Shape-generic Array Programs Using Symbolic Structural Information Kai Trojahner, Clemens Grelck, Sven-Bodo Scholz (2007). In Implementation and Application of Functional Languages, 18th International Symposium (IFL'06), Budapest, Hungary, Revised Selected Papers. pp. 1–18. Springer. DOI oosgapussi.pdf BibTexAbstract
    @InProceedings{TrojGrelSchoIFL06,
      author     = {Kai Trojahner and Clemens Grelck and Sven-Bodo Scholz},
      title      = {On Optimising Shape-generic Array Programs Using Symbolic Structural Information},
      booktitle  = {Implementation and Application of Functional Languages, 18th International Symposium (IFL'06), Budapest, Hungary, Revised Selected Papers},
      year       = {2007},
      editor     = {Zoltan Horv\'ath and Vikt\'oria Zs\'ok},
      volume     = {4449},
      series     = {Lecture Notes in Computer Science},
      pages      = {1--18},
      publisher  = {Springer},
      abstract   = { Shape-generic programming and high run time performance do match if generic source code is systematically specialised into nongeneric executable code. However, as soon as we drop the assumption of whole-world knowledge or refrain from specialisation for other reasons, compiled generic code is substantially less efficient. Limited effectiveness of code optimisation techniques due to the inherent lack of knowledge about the structural properties of arrays can be identified as the single most important source of inefficiency. However, in many cases partial structural information or structural relationships between arrays would actually suffice for optimisation. We propose symbolic array attributes as a uniform scheme to infer and to represent partial and relational structural information in shape-generic array code. By reusing the regular language to express structural properties in intermediate code, existing optimisations benefit from symbolic array attributes with little or no alteration. In fact, program optimisation and identification of structural properties cross-fertilise each other. We outline our approach in the context of the functional array language SaC and demonstrate its effectiveness by a small case study.},
      affil      = {ctca},
      category   = {core, opt},
      doi        = {10.1007/978-3-540-74130-5_},
      pubaddress = {Berlin, Heidelberg, Germany},
      topics     = {SAC},
      url        = {oosgapussi.pdf},
    }
    Shape-generic programming and high run time performance do match if generic source code is systematically specialised into nongeneric executable code. However, as soon as we drop the assumption of whole-world knowledge or refrain from specialisation for other reasons, compiled generic code is substantially less efficient. Limited effectiveness of code optimisation techniques due to the inherent lack of knowledge about the structural properties of arrays can be identified as the single most important source of inefficiency. However, in many cases partial structural information or structural relationships between arrays would actually suffice for optimisation. We propose symbolic array attributes as a uniform scheme to infer and to represent partial and relational structural information in shape-generic array code. By reusing the regular language to express structural properties in intermediate code, existing optimisations benefit from symbolic array attributes with little or no alteration. In fact, program optimisation and identification of structural properties cross-fertilise each other. We outline our approach in the context of the functional array language SaC and demonstrate its effectiveness by a small case study.
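    A situation where relational shape information suffices can be sketched in two lines. The example below is hypothetical and assumes the standard-library rotate, whose result has the same shape as its argument: although the concrete shape of a is unknown, recording symbolically that shape(b) equals shape(a) is enough to let the optimiser fold the with-loops behind rotate and + into one.

      double[+] smooth (double[+] a)
      {
        b = rotate( 0, 1, a );   /* shape(b) == shape(a), known only symbolically */
        return 0.5 * (a + b);
      }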

2006

  • Sac: A Functional Array Language for Efficient Multithreaded Execution Clemens Grelck, Sven-Bodo Scholz (2006). International Journal of Parallel Programming 34 (4) pp. 383–427. DOI safalfeme.pdf BibTexAbstract
    @Article{GrelSchoIJPP06,
      author    = {Clemens Grelck and Sven-Bodo Scholz},
      title     = {Sac: A Functional Array Language for Efficient Multithreaded Execution},
      journal   = {International Journal of Parallel Programming},
      year      = {2006},
      volume    = {34},
      number    = {4},
      pages     = {383--427},
      abstract  = {We give an in-depth introduction to the design of our functional array programming language SaC, the main aspects of its compilation into host machine code, and its parallelisation based on multi-threading. The language design of SaC aims at combining high-level, compositional array programming with fully automatic resource management for highly productive code development and maintenance. We outline the compilation process that maps SaC programs to computing machinery. Here, our focus is on optimisation techniques that aim at restructuring entire applications from nested compositions of general fine-grained operations into specialised coarse-grained operations. We present our implicit parallelisation technology for shared memory architectures based on multi-threading and discuss further optimisation opportunities on this level of code generation. Both optimisation and parallelisation rigorously exploit the absence of side-effects and the explicit data flow characteristic of a functional setting.},
      affil     = {ctca},
      category  = {core, design},
      contents  = {[ISSN: 0885-7458 (Paper) 1573-7640 (Online)]},
      doi       = {10.1007/s10766-006-0018-x},
      sourceurl = {http://dx.doi.org/10.1007/s10766-006-0018-x},
      topics    = {SAC},
      url       = {safalfeme.pdf},
    }
    We give an in-depth introduction to the design of our functional array programming language SaC, the main aspects of its compilation into host machine code, and its parallelisation based on multi-threading. The language design of SaC aims at combining high-level, compositional array programming with fully automatic resource management for highly productive code development and maintenance. We outline the compilation process that maps SaC programs to computing machinery. Here, our focus is on optimisation techniques that aim at restructuring entire applications from nested compositions of general fine-grained operations into specialised coarse-grained operations. We present our implicit parallelisation technology for shared memory architectures based on multi-threading and discuss further optimisation opportunities on this level of code generation. Both optimisation and parallelisation rigorously exploit the absence of side-effects and the explicit data flow characteristic of a functional setting.
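    The compositional style described here, and the coarse-graining the compiler performs on it, fit in a few lines; the following is an illustrative sketch rather than code from the paper.

      use Array: all;
      use Math: all;

      /* Each operation (*, sum, sqrt) is a fine-grained, rank-generic
         building block; the compiler restructures the composition into a
         single coarse-grained, implicitly multithreaded loop nest. */
      double l2norm (double[*] a)
      {
        return sqrt( sum( a * a ) );
      }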

2003

  • Sac — from High-level Programming with Arrays to Efficient Parallel Execution Clemens Grelck, Sven-Bodo Scholz (2003). Parallel Processing Letters 13 (3) pp. 401–412. DOI sfhlpwatepe.pdf BibTexAbstract
    @Article{GrelSchoPPL03,
      author   = {Clemens Grelck and Sven-Bodo Scholz},
      title    = {Sac --- from High-level Programming with Arrays to Efficient Parallel Execution},
      journal  = {Parallel Processing Letters},
      year     = {2003},
      volume   = {13},
      number   = {3},
      pages    = {401--412},
      abstract = {SAC is a purely functional array processing language designed with numerical applications in mind. It supports generic, high-level program specifications in the style of APL. However, rather than providing a fixed set of built-in array operations, SAC provides means to specify such operations in the language itself in a way that still allows their application to arrays of any rank and size. This paper illustrates the major steps in compiling generic, rank- and shape-invariant SAC specifications into efficiently executable multithreaded code for parallel execution on shared memory multiprocessors. The effectiveness of the compilation techniques is demonstrated by means of a small case study on the PDE1 benchmark, which implements 3-dimensional red/black successive over-relaxation. Comparisons with HPF and ZPL show that despite the genericity of code, SAC achieves highly competitive runtime performance characteristics.},
      category = {core,design},
      doi      = {10.1142/S0129626403001379},
      topics   = {SAC},
      url      = {SFHLPWATEPE.pdf},
    }
    SAC is a purely functional array processing language designed with numerical applications in mind. It supports generic, high-level program specifications in the style of APL. However, rather than providing a fixed set of built-in array operations, SAC provides means to specify such operations in the language itself in a way that still allows their application to arrays of any rank and size. This paper illustrates the major steps in compiling generic, rank- and shape-invariant SAC specifications into efficiently executable multithreaded code for parallel execution on shared memory multiprocessors. The effectiveness of the compilation techniques is demonstrated by means of a small case study on the PDE1 benchmark, which implements 3-dimensional red/black successive over-relaxation. Comparisons with HPF and ZPL show that despite the genericity of code, SAC achieves highly competitive runtime performance characteristics.
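    The flavour of such a shape-invariant specification can be seen in a relaxation kernel. The sketch below is a simplified Jacobi-style step over a 3-dimensional grid, not the paper's red/black variant: the with-loop touches only inner elements, and modarray keeps the boundary unchanged.

      double[.,.,.] relax (double[.,.,.] a)
      {
        return with {
                 ( [1,1,1] <= iv < shape(a) - 1 ) :
                   ( a[iv + [1,0,0]] + a[iv - [1,0,0]]
                   + a[iv + [0,1,0]] + a[iv - [0,1,0]]
                   + a[iv + [0,0,1]] + a[iv - [0,0,1]] ) / 6.0;
               } : modarray( a );
      }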
  • Single Assignment C — Efficient Support for High-level Array Operations in a Functional Setting Sven-Bodo Scholz (2003). Journal of Functional Programming 13 (6) pp. 1005–1059. DOI sacesfhlaoiafs.pdf BibTexAbstract
    @Article{ScholzJFP03,
      author   = {Sven-Bodo Scholz},
      title    = {Single Assignment C --- Efficient Support for High-level Array Operations in a Functional Setting},
      journal  = {Journal of Functional Programming},
      year     = {2003},
      volume   = {13},
      number   = {6},
      pages    = {1005--1059},
      contents = {Canonical reference for SAC},
      topics   = {SAC},
      summary    = {},
      abstract   = {This paper presents a novel approach for integrating arrays with access time O(1) into functional languages. It introduces n-dimensional arrays combined with a type system that supports hierarchies of array types with varying shape information as well as a shape-invariant form of array comprehension called with-loop. Together, these constructs allow for a programming style similar to that of array programming languages such as Apl. We use Single Assignment C (SaC for short), a functional C-variant aimed at numerical applications that is based on the proposed design, to demonstrate that programs written in that style can be compiled to code whose runtime performance is competitive with that of hand-optimized Fortran programs. However, essential prerequisites for such performance figures are a shape inference system integrated in the type system as well as several high-level optimizations. Most notable of these is With-Loop Folding, an optimization technique for eliminating intermediate arrays.},
      category   = {core, design},
      doi        = {10.1.1.138.6995},
      url        = {SACESFHLAOIAFS.pdf},
    }
    This paper presents a novel approach for integrating arrays with access time O(1) into functional languages. It introduces n-dimensional arrays combined with a type system that supports hierarchies of array types with varying shape information as well as a shape-invariant form of array comprehension called with-loop. Together, these constructs allow for a programming style similar to that of array programming languages such as Apl. We use Single Assignment C (SaC for short), a functional C-variant aimed at numerical applications that is based on the proposed design, to demonstrate that programs written in that style can be compiled to code whose runtime performance is competitive with that of hand-optimized Fortran programs. However, essential prerequisites for such performance figures are a shape inference system integrated in the type system as well as several high-level optimizations. Most notable of these is With-Loop Folding, an optimization technique for eliminating intermediate arrays.
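    The hierarchy of array types with varying shape information can be written down directly; a minimal sketch (illustrative, not taken from the paper):

      /* Decreasing shape information within one element type:
           double[3,4]  -- rank and shape statically known
           double[.,.]  -- rank known, shape unknown
           double[+]    -- non-scalar, rank unknown
           double[*]    -- any array, including scalars
         A function on the most general type applies to all of them: */
      double[*] increment (double[*] a)
      {
        return with {
                 ( 0 * shape(a) <= iv < shape(a) ) : a[iv] + 1.0;
               } : genarray( shape(a), 0.0 );
      }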
  • A Multithreaded Compiler Backend for High-level Array Programming Clemens Grelck (2003). In 2nd International Conference on Parallel and Distributed Computing and Networks (PDCN'03), Innsbruck, Austria. pp. 478–484. ACTA Press. DOI amcbfhlap.pdf BibTexAbstract
    @InProceedings{GrelckPDCN03,
      author     = {Clemens Grelck},
      title      = {A Multithreaded Compiler Backend for High-level Array Programming},
      booktitle  = {2nd International Conference on Parallel and Distributed Computing and Networks (PDCN'03), Innsbruck, Austria},
      year       = {2003},
      editor     = {Mohammed H. Hamza},
      pages      = {478--484},
      publisher  = {ACTA Press},
      abstract   = {Whenever large homogeneous data structures need to be processed in a non-trivial way, e.g. in computational sciences, image processing, or system simulation, high-level array programming in the style of APL offers a far more concise and abstract approach than traditional scalar languages such as C/C++ or FORTRAN-77. The same sort of applications often can also be characterized as performance critical and today represents the major domain for parallel processing. This paper reports on the development of a compiler backend which allows to implicitly generate multithreaded code from high-level array program specifications. On shared memory multiprocessor systems, this code can be executed in parallel without any additional programming effort. After sketching out basic compilation schemes, optimizations on the runtime system are addressed and, finally, experimental runtime figures are presented.},
      category   = {core, par, opt},
      contents   = {[ISBN 0-88986-341-5],[ISSN 1027-2666]},
      doi        = {10.1.1.408.8553},
      pubaddress = {Anaheim, CA, USA},
      sourceurl  = {http://www.isp.mu-luebeck.de/~grelck/publications/mt-backend-innsbruck-03.ps.gz},
      topics     = {SAC,Shared Memory},
      url        = {AMCBFHLAP.pdf},
    }
    Whenever large homogeneous data structures need to be processed in a non-trivial way, e.g. in computational sciences, image processing, or system simulation, high-level array programming in the style of APL offers a far more concise and abstract approach than traditional scalar languages such as C/C++ or FORTRAN-77. The same sort of applications often can also be characterized as performance critical and today represents the major domain for parallel processing. This paper reports on the development of a compiler backend which allows to implicitly generate multithreaded code from high-level array program specifications. On shared memory multiprocessor systems, this code can be executed in parallel without any additional programming effort. After sketching out basic compilation schemes, optimizations on the runtime system are addressed and, finally, experimental runtime figures are presented.
  • Axis Control in Sac Clemens Grelck, Sven-Bodo Scholz (2003). In Implementation of Functional Languages, 14th International Workshop (IFL'02), Madrid, Spain, Revised Selected Papers. pp. 182–198. Springer. DOI acis.pdf BibTexAbstract
    @InProceedings{GrelSchoIFL02,
      author     = {Clemens Grelck and Sven-Bodo Scholz},
      title      = {Axis Control in Sac},
      booktitle  = {Implementation of Functional Languages, 14th International Workshop (IFL'02), Madrid, Spain, Revised Selected Papers},
      year       = {2003},
      editor     = {Ricardo Pe{\~n}a and Thomas Arts},
      volume     = {2670},
      series     = {Lecture Notes in Computer Science},
      pages      = {182--198},
      publisher  = {Springer},
      abstract   = {High-level array processing is characterized by the composition of generic operations, which treat all array elements in a uniform way. This paper proposes a mechanism that allows programmers to direct effects of such array operations to non-scalar subarrays of argument arrays without sacrificing the high-level programming approach. A versatile notation for axis control is presented, and it is shown how the additional language constructs can be transformed into regular SaC code. Furthermore, an optimization technique is introduced which achieves the same runtime performance regardless of whether code is written using the new notation or in a substantially less elegant style employing conventional language features.},
      category   = {core, opt},
      doi        = {10.1.1.540.8938},
      pubaddress = {Berlin, Heidelberg, Germany},
      sourceurl  = {http://www.isp.mu-luebeck.de/~grelck/publications/axis-control-madrid-02.ps.gz},
      topics     = {SAC},
      url        = {ACIS.pdf},
    }
    High-level array processing is characterized by the composition of generic operations, which treat all array elements in a uniform way. This paper proposes a mechanism that allows programmers to direct effects of such array operations to non-scalar subarrays of argument arrays without sacrificing the high-level programming approach. A versatile notation for axis control is presented, and it is shown how the additional language constructs can be transformed into regular SaC code. Furthermore, an optimization technique is introduced which achieves the same runtime performance regardless of whether code is written using the new notation or in a substantially less elegant style employing conventional language features.
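    The notation proposed here lives on in SaC's set notation. As a sketch (written in current SaC syntax, which may differ in detail from the paper's original proposal), transposition directs an operation at individual axes without any explicit loop bounds or index arithmetic:

      double[.,.] transpose (double[.,.] m)
      {
        return { [i,j] -> m[[j,i]] };
      }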

2002

  • Implementing the Nas Benchmark Mg in Sac Clemens Grelck (2002). In 16th International Parallel and Distributed Processing Symposium (IPDPS'02), Fort Lauderdale, USA. IEEE Computer Society Press. DOI itnbmis.pdf BibTexAbstract
    @InProceedings{GrelckIPDPS02,
      author    = {Clemens Grelck},
      title     = {Implementing the Nas Benchmark Mg in Sac},
      booktitle = {16th International Parallel and Distributed Processing Symposium (IPDPS'02), Fort Lauderdale, USA},
      year      = {2002},
      editor    = {Viktor K. Prasanna and George Westrom},
      publisher = {IEEE Computer Society Press},
      abstract  = {SAC is a purely functional array processing language designed with numerical applications in mind. It supports generic, high-level program specifications in the style of APL. However, rather than providing a fixed set of builtin array operations, SAC provides means to specify such operations in the language itself in a way that still allows their application to arrays of any dimension and size. This paper illustrates the specificational benefits of this approach by means of a high-level SAC implementation of the NAS benchmark MG realizing 3-dimensional multigrid relaxation with periodic boundary conditions. Despite the high-level approach, experiments show that by means of aggressive compiler optimizations SAC manages to achieve performance characteristics in the range of low-level Fortran and C implementations. For benchmark size class A, SAC is outperformed by the serial Fortran-77 reference implementation of the benchmark by only 23%, whereas SAC itself outperforms a C implementation by the same figure. Furthermore, implicit parallelization of the SAC code for shared memory multiprocessors achieves a speedup of 7.6 with 10 processors. With these figures, SAC outperforms both automatic parallelization of the serial Fortran-77 reference implementation as well as an OpenMP solution based on C code.},
      category  = {core, opt},
      doi       = {10.1.1.131.2439},
      sourceurl = {http://www.isp.mu-luebeck.de/~grelck/publications/sac-nas-mg-fortlauderdale-02.ps.gz},
      topics    = {SAC,Benchmarks,NAS},
      url       = {ITNBMIS.pdf},
    }
    SAC is a purely functional array processing language designed with numerical applications in mind. It supports generic, high-level program specifications in the style of APL. However, rather than providing a fixed set of builtin array operations, SAC provides means to specify such operations in the language itself in a way that still allows their application to arrays of any dimension and size. This paper illustrates the specificational benefits of this approach by means of a high-level SAC implementation of the NAS benchmark MG realizing 3-dimensional multigrid relaxation with periodic boundary conditions. Despite the high-level approach, experiments show that by means of aggressive compiler optimizations SAC manages to achieve performance characteristics in the range of low-level Fortran and C implementations. For benchmark size class A, SAC is outperformed by the serial Fortran-77 reference implementation of the benchmark by only 23%, whereas SAC itself outperforms a C implementation by the same figure. Furthermore, implicit parallelization of the SAC code for shared memory multiprocessors achieves a speedup of 7.6 with 10 processors. With these figures, SAC outperforms both automatic parallelization of the serial Fortran-77 reference implementation as well as an OpenMP solution based on C code.
  • A Compilation Scheme for a Hierarchy of Array Types Dietmar Kreye (2002). In Implementation of Functional Languages, 13th International Workshop (IFL'01), Stockholm, Sweden, Selected Papers. pp. 18–35. Springer. DOI acsfahoat.pdf BibTexAbstract
    @InProceedings{KreyeIFL01,
      author     = {Dietmar Kreye},
      title      = {A Compilation Scheme for a Hierarchy of Array Types},
      booktitle  = {Implementation of Functional Languages, 13th International Workshop (IFL'01), Stockholm, Sweden, Selected Papers},
      year       = {2002},
      editor     = {Thomas Arts and Markus Mohnen},
      volume     = {2312},
      series     = {Lecture Notes in Computer Science},
      pages      = {18--35},
      publisher  = {Springer},
      pubaddress = {Berlin, Heidelberg, Germany},
      sourceurl  = {http://www.sac-home.org/publications/sac2c-comp-hierarchy-stockholm-01.ps.gz},
      topics     = {SAC,Implementation of Arrays},
      summary    = {},
      abstract   = {In order to achieve a high level of abstraction, array-oriented languages provide language constructs for defining array operations in a shape-invariant way. However, when trying to compile such generic array operations into efficiently executable code, static knowledge of exact shapes is essential. Therefore, modern compilers try to infer the shapes of all arrays used in a program. Unfortunately, shape inference is generally undecidable. Therefore, most compilers either rule out all programs for which shape inference fails, or they perform no shape inference at all. In the first case the expressive power of the language is restricted, in the latter the generated code has a weak runtime performance. This paper presents a new compilation scheme for the language Sac which combines these two approaches in order to avoid their individual shortcomings. A preliminary performance evaluation demonstrates the benefits of this compilation scheme.},
      category   = {core, opt},
      doi        = {10.1.1.138.8533},
      url        = {ACSFAHOAT.pdf},
    }
    In order to achieve a high level of abstraction, array-oriented languages provide language constructs for defining array operations in a shape-invariant way. However, when trying to compile such generic array operations into efficiently executable code, static knowledge of exact shapes is essential. Therefore, modern compilers try to infer the shapes of all arrays used in a program. Unfortunately, shape inference is generally undecidable. Therefore, most compilers either rule out all programs for which shape inference fails, or they perform no shape inference at all. In the first case the expressive power of the language is restricted, in the latter the generated code has a weak runtime performance. This paper presents a new compilation scheme for the language Sac which combines these two approaches in order to avoid their individual shortcomings. A preliminary performance evaluation demonstrates the benefits of this compilation scheme.

2001

  • Improving Cache Effectiveness through Array Data Layout in Sac Clemens Grelck (2001). In Implementation of Functional Languages, 12th International Workshop (IFL'00), Aachen, Germany, Selected Papers. pp. 231–248. Springer. DOI array-data-layout-aachen-00.pdf BibTexAbstract
    @InProceedings{GrelckIFL00,
      author     = {Clemens Grelck},
      title      = {Improving Cache Effectiveness through Array Data Layout in Sac},
      booktitle  = {Implementation of Functional Languages, 12th International Workshop (IFL'00), Aachen, Germany, Selected Papers},
      year       = {2001},
      editor     = {Markus Mohnen and Pieter Koopman},
      volume     = {2011},
      series     = {Lecture Notes in Computer Science},
      pages      = {231--248},
      publisher  = {Springer},
      abstract   = {SAC is a functional array processing language particularly designed with numerical applications in mind. In this field the runtime performance of programs critically depends on the efficient utilization of the memory hierarchy. Cache conflicts due to limited set associativity are one relevant source of inefficiency. This paper describes the realization of an optimization technique which aims at eliminating cache conflicts by adjusting the data layout of arrays to specific access patterns and cache configurations. Its effect on cache utilization and runtime performance is demonstrated by investigations on the PDE1 benchmark.},
      category   = {core,opt},
      doi        = {10.1007/3-540-45361-X_14},
      pubaddress = {Berlin, Heidelberg, Germany},
      summary    = {This paper describes the implementation of implicit array padding as a way to resolve self-interference cache conflicts in SAC programs. Besides an improved inference heuristic, array access pattern analysis and the realization of padding as a high-level program transformation on intermediate SAC code are outlined.},
      topics     = {SAC,Array Data Layout},
      url        = {array-data-layout-aachen-00.pdf},
    }
    SAC is a functional array processing language particularly designed with numerical applications in mind. In this field the runtime performance of programs critically depends on the efficient utilization of the memory hierarchy. Cache conflicts due to limited set associativity are one relevant source of inefficiency. This paper describes the realization of an optimization technique which aims at eliminating cache conflicts by adjusting the data layout of arrays to specific access patterns and cache configurations. Its effect on cache utilization and runtime performance is demonstrated by investigations on the PDE1 benchmark.

2000

  • Hpf Vs. Sac — a Case Study Clemens Grelck, Sven-Bodo Scholz (2000). In Euro-Par 2000 Parallel Processing, 6th International Euro-Par Conference (Euro-Par'00), Munich, Germany. pp. 620–624. Springer. DOI hpf-vs-sac-muenchen-00.pdf BibTexAbstract
    @InProceedings{GrelSchoEUROPAR00,
      author     = {Clemens Grelck and Sven-Bodo Scholz},
      title      = {Hpf Vs. Sac --- a Case Study},
      booktitle  = {Euro-Par 2000 Parallel Processing, 6th International Euro-Par Conference (Euro-Par'00), Munich, Germany},
      year       = {2000},
      editor     = {Arndt Bode and Thomas Ludwig and Wolfgang Karl and Roland Wismüller},
      volume     = {1900},
      series     = {Lecture Notes in Computer Science},
      pages      = {620--624},
      publisher  = {Springer},
      abstract   = {This paper compares the functional programming language Sac to Hpf with respect to specificational elegance and runtime performance. A well-known benchmark, red-black SOR, serves as a case study. After presenting the Hpf reference implementation, alternative Sac implementations are discussed. Eventually, performance figures show the ability to compile highly generic Sac specifications into machine code that outperforms the Hpf implementation on a shared memory multi-processor by a factor of about 3.},
      category   = {core,apps,par},
      doi        = {10.1007/3-540-44520-X_87},
      pubaddress = {Berlin, Heidelberg, Germany},
      summary    = {This paper compares SAC to HPF in terms of expressiveness and execution runtimes. Red/black successive over-relaxation serves as a case study. Several SAC specifications are presented and compared with a HPF reference implementation. Performance figures for up to 10 processors of a shared memory multiprocessor are presented.},
      topics     = {SAC,HPF},
      url        = {hpf-vs-sac-muenchen-00.pdf},
    }
    This paper compares the functional programming language Sac to Hpf with respect to specificational elegance and runtime performance. A well-known benchmark, red-black SOR, serves as a case study. After presenting the Hpf reference implementation, alternative Sac implementations are discussed. Eventually, performance figures show the ability to compile highly generic Sac specifications into machine code that outperforms the Hpf implementation on a shared memory multi-processor by a factor of about 3.
  • Array Padding in the Functional Language Sac Clemens Grelck (2000). In International Conference on Parallel and Distributed Processing Techniques and Applications (PDPTA'00), Las Vegas, Nevada, USA. pp. 2553–2560. CSREA Press, Athens, Georgia, USA. array-padding-lasvegas-00.pdf BibTexAbstract
    @InProceedings{GrelckPDPTA00,
      author    = {Clemens Grelck},
      title     = {Array Padding in the Functional Language Sac},
      booktitle = {International Conference on Parallel and Distributed Processing Techniques and Applications (PDPTA'00), Las Vegas, Nevada, USA},
      year      = {2000},
      editor    = {Hamid R. Arabnia},
      volume    = {5},
      pages     = {2553--2560},
      publisher = {CSREA Press, Athens, Georgia, USA},
      abstract  = {Sac is a functional array processing language that tries to combine generic, high-level program specifications with efficient runtime behavior. Being particularly designed for numerical applications, runtime performance critically depends on the effective utilization of the memory hierarchy. For many programs, however, it can be observed that the achieved performance significantly changes with small variations in the problem size. Array padding is a well-known optimization technique that adjusts the data layout of arrays in order to make better usage of caches. The paper presents an algorithm that derives a customized data layout from an array access pattern and a cache specification. Cache phenomena such as spatial and temporal reuse are taken into account as well as different cache architectures. The effectiveness is demonstrated by investigations on the runtime performance of the PDE1 benchmark on a shared memory multiprocessor.},
      category  = {core,opt,par},
      summary   = {This paper describes an inference strategy to identify suitable pad sizes for intra-array padding in order to eliminate self interference cache conflicts in numerical algorithms. Its performance impact in particular with respect to multithreaded program execution is shown by means of runtime figures.},
      topics    = {SAC,Array Data Layout},
      url       = {array-padding-lasvegas-00.pdf},
    }
    Sac is a functional array processing language that tries to combine generic, high-level program specifications with efficient runtime behavior. Being particularly designed for numerical applications, runtime performance critically depends on the effective utilization of the memory hierarchy. For many programs, however, it can be observed that the achieved performance significantly changes with small variations in the problem size. Array padding is a well-known optimization technique that adjusts the data layout of arrays in order to make better usage of caches. The paper presents an algorithm that derives a customized data layout from an array access pattern and a cache specification. Cache phenomena such as spatial and temporal reuse are taken into account as well as different cache architectures. The effectiveness is demonstrated by investigations on the runtime performance of the PDE1 benchmark on a shared memory multiprocessor.
  • On Code Generation for Multi-generator With-loops in Sac Clemens Grelck, Dietmar Kreye, Sven-Bodo Scholz (2000). In Implementation of Functional Languages, 11th International Workshop (IFL'99), Lochem, The Netherlands, Selected Papers. pp. 77–94. Springer. DOI sac2c-codegen-wl-lochem-99.pdf BibTexAbstract
    @InProceedings{GrelKreySchoIFL99,
      author     = {Clemens Grelck and Dietmar Kreye and Sven-Bodo Scholz},
      title      = {On Code Generation for Multi-generator With-loops in Sac},
      booktitle  = {Implementation of Functional Languages, 11th International Workshop (IFL'99), Lochem, The Netherlands, Selected Papers},
      year       = {2000},
      editor     = {Pieter Koopman and Chris Clack},
      volume     = {1868},
      series     = {Lecture Notes in Computer Science},
      pages      = {77--94},
      publisher  = {Springer},
      abstract   = {Most array operations in Sac are specified in terms of so-called with-loops, a sac-specific form of array comprehension. Due to the map-like semantics of with-loops its loop instances can be computed in any order which provides considerable freedom when it comes to compiling them into nestings of for-loops in C. This paper discusses several different execution orders and their impact on compilation complexity, size of generated code, and execution runtimes. As a result, a multiply parameterized compilation scheme is proposed which achieves speedups of up to a factor of 16 when compared against a naïve compilation scheme.},
      category   = {core,design},
      doi        = {10.1007/10722298_5},
      pubaddress = {Berlin, Heidelberg, Germany},
      summary    = {This paper presents the essentials of Dietmar's Diploma thesis (see below). Besides an informal presentation of the compilation steps involved, the design choices made are substantiated by means of a few performance measurements.},
      topics     = {SAC,Loop Transformation},
      url        = {sac2c-codegen-WL-lochem-99.pdf},
    }
    Most array operations in Sac are specified in terms of so-called with-loops, a sac-specific form of array comprehension. Due to the map-like semantics of with-loops its loop instances can be computed in any order which provides considerable freedom when it comes to compiling them into nestings of for-loops in C. This paper discusses several different execution orders and their impact on compilation complexity, size of generated code, and execution runtimes. As a result, a multiply parameterized compilation scheme is proposed which achieves speedups of up to a factor of 16 when compared against a naïve compilation scheme.
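    What makes code generation non-trivial here is that one with-loop may contain several generators covering disjoint regions of the index space, which the compiler must linearise into an efficient nesting of C for-loops. A minimal sketch (hypothetical example, not from the paper):

      double[.] zero_borders (double[.] a)
      {
        return with {
                 ( [0] <= iv < [1] )               : 0.0;     /* left border  */
                 ( [1] <= iv < shape(a) - 1 )      : a[iv];   /* interior     */
                 ( shape(a) - 1 <= iv < shape(a) ) : 0.0;     /* right border */
               } : genarray( shape(a), 0.0 );
      }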

1999

  • Accelerating Apl Programs with Sac Clemens Grelck, Sven-Bodo Scholz (1999). In International Conference on Array Processing Languages (APL'99), Scranton, Pennsylvania, USA. pp. 50–57. ACM Press. DOI sac-accel-apl-scranton-99.pdf BibTexAbstract
    @InProceedings{GrelSchoAPL99,
      author    = {Clemens Grelck and Sven-Bodo Scholz},
      title     = {Accelerating Apl Programs with Sac},
      booktitle = {International Conference on Array Processing Languages (APL'99), Scranton, Pennsylvania, USA},
      year      = {1999},
      editor    = {Olivier Lefèvre},
      volume    = {29},
      number    = {2},
      series    = {APL Quote Quad},
      pages     = {50--57},
      publisher = {ACM Press},
      abstract  = {The paper investigates how SAC, a purely functional language based on C syntax, relates to APL in terms of expressiveness and run-time behavior. To do so, three different excerpts of real world APL programs are examined. It is shown that after defining the required APL primitives in SAC, the example programs can be re-written in SAC with an almost one-to-one correspondence. Run-time comparisons between interpreting APL programs and compiled SAC programs show that speedups due to compilation vary between 2 and 500 for three representative benchmark programs.},
      category  = {core,apps,design},
      doi       = {10.1145/379277.312719},
      summary   = {This paper compares SAC to APL in terms of expressiveness and execution runtimes. Some APL applications are compiled by hand into almost identical SAC programs and the resulting execution times are discussed.},
      topics    = {SAC,APL},
      url       = {sac-accel-apl-scranton-99.pdf},
    }
    The paper investigates how SAC, a purely functional language based on C syntax, relates to APL in terms of expressiveness and run-time behavior. To do so, three different excerpts of real world APL programs are examined. It is shown that after defining the required APL primitives in SAC, the example programs can be re-written in SAC with an almost one-to-one correspondence. Run-time comparisons between interpreting APL programs and compiled SAC programs show that speedups due to compilation vary between 2 and 500 for three representative benchmark programs.
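    As a flavour of the approach, an APL-like primitive can be defined as a with-loop in a few lines; the following take on integer vectors is a hypothetical sketch in modern sac2c syntax (assuming 0 <= n <= shape(v)[0]), not one of the paper's benchmark excerpts.

        use Array: all;

        /* APL-style take: the first n elements of a vector */
        int[.] take( int n, int[.] v )
        {
          return with {
                   ( [0] <= iv < [n] ) : v[iv];
                 } : genarray( [n], 0 );
        }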
  • Shared Memory Multiprocessor Support for Sac Clemens Grelck (1999). In Implementation of Functional Languages, 10th International Workshop (IFL'98), London, England, UK, Selected Papers. pp. 38–54. Springer. DOI mt-support-london-98.pdf BibTexAbstract
    @InProceedings{GrelckIFL98,
      author     = {Clemens Grelck},
      title      = {Shared Memory Multiprocessor Support for Sac},
      booktitle  = {Implementation of Functional Languages, 10th International Workshop (IFL'98), London, England, UK, Selected Papers},
      year       = {1999},
      editor     = {Kevin Hammond and Tony Davie and Chris Clack},
      volume     = {1595},
      series     = {Lecture Notes in Computer Science},
      pages      = {38--54},
      publisher  = {Springer},
      abstract   = {Sac (Single Assignment C) is a strict, purely functional programming language primarily designed with numerical applications in mind. Particular emphasis is on efficient support for arrays both in terms of language expressiveness and in terms of runtime performance. Array operations in Sac are based on element-wise specifications using so-called With-loops. These language constructs are also well-suited for concurrent execution on multiprocessor systems. This paper outlines an implicit approach to compile Sac programs for multi-threaded execution on shared memory architectures. Besides the basic compilation scheme, a brief overview of the runtime system is given. Finally, preliminary performance figures demonstrate that this approach is well-suited to achieve almost linear speedups.},
      category   = {core,design,par,opt},
      doi        = {10.1007/3-540-48515-5_3},
      pubaddress = {Berlin, Heidelberg, Germany},
      summary    = {This paper introduces the basic concepts of multi-threaded program execution on shared memory multiprocessor systems. The described approach is completely implicit, i.e. the compiler and the runtime system handle everything from granularity to load balancing on their own. The paper also contains preliminary performance evaluations.},
      topics     = {SAC,Shared Memory},
      url        = {mt-support-london-98.pdf},
    }
    Sac (Single Assignment C) is a strict, purely functional programming language primarily designed with numerical applications in mind. Particular emphasis is on efficient support for arrays both in terms of language expressiveness and in terms of runtime performance. Array operations in Sac are based on element-wise specifications using so-called With-loops. These language constructs are also well-suited for concurrent execution on multiprocessor systems. This paper outlines an implicit approach to compile Sac programs for multi-threaded execution on shared memory architectures. Besides the basic compilation scheme, a brief overview of the runtime system is given. Finally, preliminary performance figures demonstrate that this approach is well-suited to achieve almost linear speedups.
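    The enabling observation is that with-loops are data-parallel by construction, so no source-level annotations are needed; a minimal sketch in modern sac2c syntax (our example, not the paper's):

        use Array: all;

        double[.,.] scale( double[.,.] a, double f )
        {
          /* each element is defined independently of all others, so the
             compiler and runtime system are free to split the index
             space across threads, including scheduling and load
             balancing, without any hints in the source code */
          return with {
                   ( [0,0] <= iv < shape(a) ) : f * a[iv];
                 } : genarray( shape(a), 0.0 );
        }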
  • A Case Study: Effects of With-loop Folding on the Nas Benchmark Mg in Sac Sven-Bodo Scholz (1999). In Implementation of Functional Languages, 10th International Workshop (IFL'98), London, England, UK, Selected Papers. pp. 216–228. Springer. DOI wlf-exp-london-98.pdf BibTexAbstract
    @InProceedings{ScholzIFL98,
      author     = {Sven-Bodo Scholz},
      title      = {A Case Study: Effects of With-loop Folding on the Nas Benchmark Mg in Sac},
      booktitle  = {Implementation of Functional Languages, 10th International Workshop (IFL'98), London, England, UK, Selected Papers},
      year       = {1999},
      editor     = {Kevin Hammond and Tony Davie and Chris Clack},
      volume     = {1595},
      series     = {Lecture Notes in Computer Science},
      pages      = {216--228},
      publisher  = {Springer},
      abstract   = {Sac is a functional C variant with efficient support for high-level array operations. This paper investigates the applicability of a Sac-specific optimization technique called With-loop-folding to real world applications. As an example program which originates from the Numerical Aerodynamic Simulation (NAS) Program developed at NASA Ames Research Center, the so-called NAS benchmark MG is chosen. It comprises a kernel from the NAS Program which implements 3-dimensional multigrid relaxation. Several run-time measurements exploit two different benefits of With-loop-folding: First, an overall speed-up of about 20% can be observed. Second, a comparison between the run-times of a hand-optimized specification and of APL-like specifications yields identical run-times, although a naive compilation that does not apply With-loop-folding leads to slowdowns of more than an order of magnitude. Furthermore, With-loop-folding makes a slight variation of the algorithm feasible which substantially simplifies the program specification and requires less memory during execution. Finally, the optimized run-times are compared against run-times gained from the original Fortran program, which shows that for different problem sizes, the code generated from the Sac program does not only reach the execution times of the code generated from the Fortran program but even outperforms them by about 10%.},
      category   = {apps,core,opt},
      doi        = {10.1007/3-540-48515-5_14},
      pubaddress = {Berlin, Heidelberg, Germany},
      summary    = {This paper investigates the impact of high-level program transformations, WITH-loop folding in particular, on the style of program specifications and the runtime performance of programs.},
      topics     = {SAC,Avoiding Temporaries,NAS},
      url        = {wlf-exp-london-98.pdf},
    }
    Sac is a functional C variant with efficient support for high-level array operations. This paper investigates the applicability of a Sac-specific optimization technique called With-loop-folding to real world applications. As an example program which originates from the Numerical Aerodynamic Simulation (NAS) Program developed at NASA Ames Research Center, the so-called NAS benchmark MG is chosen. It comprises a kernel from the NAS Program which implements 3-dimensional multigrid relaxation. Several run-time measurements exploit two different benefits of With-loop-folding: First, an overall speed-up of about 20% can be observed. Second, a comparison between the run-times of a hand-optimized specification and of APL-like specifications yields identical run-times, although a naive compilation that does not apply With-loop-folding leads to slowdowns of more than an order of magnitude. Furthermore, With-loop-folding makes a slight variation of the algorithm feasible which substantially simplifies the program specification and requires less memory during execution. Finally, the optimized run-times are compared against run-times gained from the original Fortran program, which shows that for different problem sizes, the code generated from the Sac program does not only reach the execution times of the code generated from the Fortran program but even outperforms them by about 10%.

1998

  • On Defining Application-specific High-level Array Operations by Means of Shape-invariant Programming Facilities Sven-Bodo Scholz (1998). In International Conference on Array Processing Languages (APL'98), Rome, Italy. pp. 40–45. ACM Press. DOI sac-defining-array-ops-rome-98.pdf BibTexAbstract
    @InProceedings{ScholzAPL98,
      author    = {Sven-Bodo Scholz},
      title     = {On Defining Application-specific High-level Array Operations by Means of Shape-invariant Programming Facilities},
      booktitle = {International Conference on Array Processing Languages (APL'98), Rome, Italy},
      year      = {1998},
      editor    = {Sergio Picchi and Marco Micocci},
      pages     = {40--45},
      publisher = {ACM Press},
      abstract  = {Most of the existing high-level array-processing languages support a fixed set of pre-defined array operations and a few higher-order functions for constructing new array operations from existing ones. In this paper, we discuss a more general approach made feasible by SAC (for Single Assignment C), a functional variant of C. SAC provides a meta-level language construct called WITH-loop which may be considered a sophisticated variant of the FORALL-loops in HPF or of array comprehensions in functional languages. It allows for the element-wise specification of high-level operations on arrays of any dimensionality: any set of high-level array operations can be specified by means of WITH-loops and be made available in a library. This not only improves the flexibility of specifications, but also simplifies the compilation process. By means of a few examples it is shown that the high-level operations that are typically available in array processing languages such as APL or FORTRAN90 can be easily specified as WITH-loops in SAC. Furthermore, we briefly outline the most important optimization techniques used in the current SAC compiler for achieving efficiently executable code. The paper finally presents a performance comparison between a high-level specification for the multigrid relaxation kernel of the NAS benchmarks in SAC on the one hand and low-level specifications in SISAL and in FORTRAN77 on the other hand. It shows that the SAC implementation, despite its higher level of abstraction, is competitive with the other two both in terms of program runtimes and memory consumption.},
      category  = {core,design},
      doi       = {10.1145/327559.327613},
      summary   = {This paper shows how APL programs can be translated into equivalent but substantially faster SAC programs with little effort.},
      topics    = {SAC},
      url       = {sac-defining-array-ops-rome-98.pdf},
    }
    Most of the existing high-level array-processing languages support a fixed set of pre-defined array operations and a few higher-order functions for constructing new array operations from existing ones. In this paper, we discuss a more general approach made feasible by SAC (for Single Assignment C), a functional variant of C. SAC provides a meta-level language construct called WITH-loop which may be considered a sophisticated variant of the FORALL-loops in HPF or of array comprehensions in functional languages. It allows for the element-wise specification of high-level operations on arrays of any dimensionality: any set of high-level array operations can be specified by means of WITH-loops and be made available in a library. This not only improves the flexibility of specifications, but also simplifies the compilation process. By means of a few examples it is shown that the high-level operations that are typically available in array processing languages such as APL or FORTRAN90 can be easily specified as WITH-loops in SAC. Furthermore, we briefly outline the most important optimization techniques used in the current SAC compiler for achieving efficiently executable code. The paper finally presents a performance comparison between a high-level specification for the multigrid relaxation kernel of the NAS benchmarks in SAC on the one hand and low-level specifications in SISAL and in FORTRAN77 on the other hand. It shows that the SAC implementation, despite its higher level of abstraction, is competitive with the other two both in terms of program runtimes and memory consumption.
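    A shape-invariant definition in the spirit of the paper, sketched in modern sac2c syntax rather than the 1998 notation: a single with-loop covers arrays of any rank because both the index range and the result shape are derived from shape(a).

        use Array: all;

        /* element-wise addition for arrays of arbitrary rank;
           0 * shape(a) yields a zero vector of matching length */
        double[*] add( double[*] a, double[*] b )
        {
          return with {
                   ( 0 * shape(a) <= iv < shape(a) ) : a[iv] + b[iv];
                 } : genarray( shape(a), 0.0 );
        }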
  • With-loop-folding in Sac — Condensing Consecutive Array Operations Sven-Bodo Scholz (1998). In Implementation of Functional Languages, 9th International Workshop (IFL'97), St. Andrews, UK, Selected Papers. pp. 72–92. Springer. DOI wlf-st-andrews-97.pdf BibTexAbstract
    @InProceedings{ScholzIFL1997,
      author     = {Sven-Bodo Scholz},
      title      = {With-loop-folding in Sac --- Condensing Consecutive Array Operations},
      booktitle  = {Implementation of Functional Languages, 9th International Workshop (IFL'97), St. Andrews, UK, Selected Papers},
      year       = {1998},
      editor     = {Chris Clack and Tony Davie and Kevin Hammond},
      volume     = {1467},
      series     = {Lecture Notes in Computer Science},
      pages      = {72--92},
      publisher  = {Springer},
      abstract   = {This paper introduces a new compiler optimization called With-loop-folding. It is based on a special loop construct, the with-loop, which in the functional language SAC (for Single Assignment C) serves as a versatile vehicle to describe array operations on an elementwise basis. A general mechanism for combining two of these With-loops into a single loop construct is presented. This mechanism constitutes a powerful tool when it comes to generate efficiently executable code from high-level array specifications. By means of a few examples it is shown that even complex nestings of array operations similar to those available in Apl can be transformed into single loop operations which are similar to hand-optimized With-loop specifications. As a consequence, the way a complex array operation is combined from primitive array operations does not affect the runtime performance of the compiled code, i.e., the programmer is liberated from the burden to take performance considerations into account when specifying complex array operations.},
      category   = {core,design,opt},
      doi        = {10.1007/BFb0055425},
      isbn       = {978-3-540-64849-9},
      pubaddress = {Berlin, Heidelberg, Germany},
      topics     = {SAC,Avoiding Temporaries,Implementation of Arrays},
      url        = {wlf-st-andrews-97.pdf},
    }
    This paper introduces a new compiler optimization called With-loop-folding. It is based on a special loop construct, the with-loop, which in the functional language SAC (for Single Assignment C) serves as a versatile vehicle to describe array operations on an elementwise basis. A general mechanism for combining two of these With-loops into a single loop construct is presented. This mechanism constitutes a powerful tool when it comes to generate efficiently executable code from high-level array specifications. By means of a few examples it is shown that even complex nestings of array operations similar to those available in Apl can be transformed into single loop operations which are similar to hand-optimized With-loop specifications. As a consequence, the way a complex array operation is combined from primitive array operations does not affect the runtime performance of the compiled code, i.e., the programmer is liberated from the burden to take performance considerations into account when specifying complex array operations.
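    The effect of the transformation can be pictured on two consecutive element-wise operations (an illustrative sketch in modern sac2c syntax, not an excerpt from the paper):

        /* before folding: two traversals and a temporary array a */
        a = with { ( 0 * shape(v) <= iv < shape(v) ) : v[iv] + 1; }
            : genarray( shape(v), 0 );
        b = with { ( 0 * shape(v) <= iv < shape(v) ) : 2 * a[iv]; }
            : genarray( shape(v), 0 );

        /* after folding: a single traversal and no temporary */
        b = with { ( 0 * shape(v) <= iv < shape(v) ) : 2 * (v[iv] + 1); }
            : genarray( shape(v), 0 );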

1997

  • An Overview of Sac – a Functional Language for Numerical Applications Sven-Bodo Scholz (1997). In Programming Languages and Fundamentals of Programming, Technical Report 9717. Institut für Informatik und Praktische Mathematik, Universität Kiel. BibTex
    @InProceedings{ScholzKPS97,
      author    = {Sven-Bodo Scholz},
      title     = {An Overview of Sac -- a Functional Language for Numerical Applications},
      booktitle = {Programming Languages and Fundamentals of Programming, Technical Report 9717},
      year      = {1997},
      editor    = {R. Berghammer and F. Simon},
      publisher = {{Institut f{\"u}r Informatik und Praktische Mathe\-matik, Universit{\"a}t Kiel}},
      category  = {core,design,apps},
      topics    = {SAC,on SAC},
    }
  • On Programming Scientific Applications in Sac - a Functional Language Extended by a Subsystem for High-level Array Operations Sven-Bodo Scholz (1997). In Implementation of Functional Languages, 8th International Workshop (IFL'96), Bonn, Germany, Selected Papers. pp. 85–104. Springer-Verlag. scientific-applications-sac-bonn-96.pdf BibTexAbstract
    @InProceedings{ScholzIFL96,
      author     = {Sven-Bodo Scholz},
      title      = {On Programming Scientific Applications in Sac - a Functional Language Extended by a Subsystem for High-level Array Operations},
      booktitle  = {Implementation of Functional Languages, 8th International Workshop (IFL'96), Bonn, Germany, Selected Papers},
      year       = {1997},
      editor     = {Werner Kluge},
      volume     = {1268},
      series     = {Lecture Notes in Computer Science},
      pages      = {85--104},
      publisher  = {Springer-Verlag},
      abstract   = {This paper discusses some of the pros and cons of extending a simple functional language called Sac (for Single-Assignment C) by array operations similar to those that are available in APL. The array operations in Sac are based on the ψ-calculus, an algebra of arrays which provides a formalism for specifying and simplifying array operations in terms of index set manipulations. The programming techniques made possible by Sac are demonstrated by means of a functional program for the approximation of numerical solutions of partial differential equations by multigrid relaxation. This application is not only of practical relevance but also fully exposes the flavors of using high-level array operations. In contrast to specifications in other languages, e.g. in Fortran or Sisal, the Sac program is well structured, reasonably concise, and - what is most important - invariant against dimensionalities and shapes. However, sophisticated compilation techniques are necessary to avoid - whenever possible - the creation of temporary arrays and to eliminate redundant operations. The paper also includes performance figures for a Sac implementation of the NAS-mgrid-benchmark which are competitive with those of a Sisal implementation.},
      category   = {apps,core,opt},
      pubaddress = {Berlin, Heidelberg, Germany},
      summary    = {This paper describes the high-level programming style intended by SAC. Multi-dimensional multigrid relaxation is investigated as a case study. The runtime performance of the SAC implementation is compared to equivalent Sisal and Fortran specifications both in terms of execution time and memory consumption.},
      topics     = {SAC,Scientific Computation},
      url        = {scientific-applications-sac-bonn-96.pdf},
    }
    This paper discusses some of the pros and cons of extending a simple functional language called Sac (for Single-Assignment C) by array operations similar to those that are available in APL. The array operations in Sac are based on the ψ-calculus, an algebra of arrays which provides a formalism for specifying and simplifying array operations in terms of index set manipulations. The programming techniques made possible by Sac are demonstrated by means of a functional program for the approximation of numerical solutions of partial differential equations by multigrid relaxation. This application is not only of practical relevance but also fully exposes the flavors of using high-level array operations. In contrast to specifications in other languages, e.g. in Fortran or Sisal, the Sac program is well structured, reasonably concise, and - what is most important - invariant against dimensionalities and shapes. However, sophisticated compilation techniques are necessary to avoid - whenever possible - the creation of temporary arrays and to eliminate redundant operations. The paper also includes performance figures for a Sac implementation of the NAS-mgrid-benchmark which are competitive with those of a Sisal implementation.
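    The programming style at stake can be glimpsed from a one-dimensional relaxation step (a simplified sketch of ours in modern sac2c syntax; the paper's multigrid code is rank-invariant and considerably richer):

        use Array: all;

        /* three-point average over the inner elements;
           modarray keeps the boundary elements of a unchanged */
        double[.] relax( double[.] a )
        {
          return with {
                   ( [1] <= iv < [shape(a)[0] - 1] )
                     : (a[iv - 1] + a[iv] + a[iv + 1]) / 3.0;
                 } : modarray( a );
        }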

1996

  • Single Assignment C – Entwurf Und Implementierung Einer Funktionalen C-variante Mit Spezieller Unterstützung Shape-invarianter Array-operationen Sven-Bodo Scholz (1996). Kiel, Germany. sac-design-sbs-phd-96.pdf BibTex
    @PhdThesis{Scholz96,
      author   = {Sven-Bodo Scholz},
      title    = {Single Assignment C -- Entwurf Und Implementierung Einer Funktionalen C-variante Mit Spezieller Unterstützung Shape-invarianter Array-operationen},
      school   = {Institut für Informatik und Praktische Mathematik, Christian-Albrechts-Universität},
      year     = {1996},
      address  = {Kiel, Germany},
      note     = {Shaker Verlag, Aachen, 1997},
      category = {design,core,types,apps},
      summary  = {Sven-Bodo's PhD thesis is the most comprehensive document about SAC. The language design is described in detail. Moreover, a formal semantics of SAC is given. Further points of emphasis are the type system and the compilation process. Multigrid relaxation is used as a case study to illustrate programming with SAC.},
      topics   = {SAC,Language Manuals,Implementation of Arrays,Language Design},
      url      = {sac-design-sbs-phd-96.pdf},
    }
  • Integration Eines Modul- Und Klassen-konzeptes in Die Funktionale Programmiersprache Sac – Single Assignment C Clemens Grelck (1996). Kiel, Germany. sac2c-modules-classes-dipl-cg-96.pdf BibTex
    @MastersThesis{Grelck96,
      author   = {Clemens Grelck},
      title    = {Integration Eines Modul- Und Klassen-konzeptes in Die Funktionale Programmiersprache Sac -- Single Assignment C},
      school   = {Institut für Informatik und Praktische Mathematik, Christian-Albrechts-Universität},
      year     = {1996},
      address  = {Kiel, Germany},
      category = {design,core,modules},
      summary  = {Clemens' Diploma thesis is a detailed description of the module system, the integration of states and state modifications including the realization of powerful I/O facilities, and the foreign language interface which allows to use functions implemented in C within SAC programs. Both language design and implementation issues are covered.},
      topics   = {SAC,Uniqueness Types,Language Interfacing},
      url      = {sac2c-modules-classes-dipl-cg-96.pdf},
    }

1995

  • Classes and Objects As Basis for I/o in Sac Clemens Grelck, Sven-Bodo Scholz (1995). In 7th International Workshop on Implementation of Functional Languages (IFL'95), Båstad, Sweden. pp. 30–44. Chalmers University of Technology, Gothenburg, Sweden. sac-classes-objects-bastad-95.pdf BibTexAbstract
    @InProceedings{GrelSchoIFL95,
      author    = {Grelck, Clemens and Scholz, Sven-Bodo},
      title     = {Classes and Objects As Basis for I/o in Sac},
      booktitle = {7th International Workshop on Implementation of Functional Languages (IFL'95), B{\aa}stad, Sweden},
      year      = {1995},
      editor    = {Thomas Johnsson},
      pages     = {30--44},
      publisher = {Chalmers University of Technology, Gothenburg, Sweden},
      abstract  = {In imperative languages I/O is realized through sequences of side-effecting function applications/procedure invocations. This seems to be a suitable way of specifying I/O since it coincides with an intuitive understanding of it as sequences of actions. In contrast, functional languages carefully have to avoid side-effects to sustain referential transparency. Many different solutions, such as dialogues, continuations, monads and uniqueness typing have been proposed. The I/O facilities of Sac are based on uniqueness typing. Instead of using an explicit type attribute as in Clean, unique types are introduced as special modules called classes. To provide a syntax as close as possible to that of imperative languages, we propose two new mechanisms to be included on top of classes in Sac: a call-by-reference mechanism and global objects. Although a combination of both mechanisms allows for very imperative-like notations, we can define a purely functional semantics. Thus we combine the advantages of referential transparency with the expressiveness of imperative I/O. Moreover, these two mechanisms allow the programmer to introduce and manipulate states of arbitrary data-structures within Sac.},
      category  = {design,core,states},
      summary   = {This paper describes the way states and state modifications are incorporated into the purely functional world of SAC based on uniqueness typing and an extension of the module system. The integration of I/O facilities is taken as a case study. The paper is the best English introduction to this part of SAC.},
      topics    = {SAC,Uniqueness Types},
      url       = {sac-classes-objects-bastad-95.pdf},
    }
    In imperative languages I/O is realized through sequences of side-effecting function applications/procedure invocations. This seems to be a suitable way of specifying I/O since it coincides with an intuitive understanding of it as sequences of actions. In contrast, functional languages carefully have to avoid side-effects to sustain referential transparency. Many different solutions, such as dialogues, continuations, monads and uniqueness typing have been proposed. The I/O facilities of Sac are based on uniqueness typing. Instead of using an explicit type attribute as in Clean, unique types are introduced as special modules called classes. To provide a syntax as close as possible to that of imperative languages, we propose two new mechanisms to be included on top of classes in Sac: a call-by-reference mechanism and global objects. Although a combination of both mechanisms allows for very imperative-like notations, we can define a purely functional semantics. Thus we combine the advantages of referential transparency with the expressiveness of imperative I/O. Moreover, these two mechanisms allow the programmer to introduce and manipulate states of arbitrary data-structures within Sac.
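    In today's SaC this design surfaces as seemingly imperative I/O; a minimal sketch, assuming the standard StdIO and Array modules:

        use StdIO: all;
        use Array: all;

        int main()
        {
          /* printf looks like a side-effecting C call, but the global
             I/O object is threaded through the program behind the
             scenes, preserving the purely functional semantics */
          printf( "sum: %d\n", sum( iota( 10 ) ) );
          return 0;
        }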

1994

  • Single-assignment C — Functional Programming Using Imperative Style Sven-Bodo Scholz (1994). In 6th International Workshop on Implementation of Functional Languages (IFL'94), Norwich, England, UK. pp. 21.1–21.13. University of East Anglia, Norwich, England, UK. sac-overview-norwich-94.pdf BibTex
    @InProceedings{ScholzIFL94,
      author    = {Sven-Bodo Scholz},
      title     = {Single-assignment C --- Functional Programming Using Imperative Style},
      booktitle = {6th International Workshop on Implementation of Functional Languages (IFL'94), Norwich, England, UK},
      year      = {1994},
      editor    = {John Glauert},
      pages     = {21.1--21.13},
      publisher = {University of East Anglia, Norwich, England, UK},
      category  = {design,core,types},
      summary   = {This is the first paper to propose SAC. The basic language design ideas are explained and its syntax is defined. However, SAC has changed a lot since then. So, this classic paper can no longer be recommended as an introductory source of information about SAC.},
      topics    = {SAC,Implementation of Arrays,Language Design},
      url       = {sac-overview-norwich-94.pdf},
    }

2009

  • Controlling Chaos — on Safe Side-effects in Data-parallel Operations Stephan Herhut, Sven-Bodo Scholz, Clemens Grelck (2009). In 4th Workshop on Declarative Aspects of Multicore Programming (DAMP'09), Savannah, USA. pp. 59–67. ACM Press. DOI 2009_5.pdf BibTexAbstract
    @InProceedings{HerhSchoGrelDAMP09,
      author     = {Stephan Herhut and Sven-Bodo Scholz and Clemens Grelck},
      title      = {Controlling Chaos --- on Safe Side-effects in Data-parallel Operations},
      booktitle  = {4th Workshop on Declarative Aspects of Multicore Programming (DAMP'09), Savannah, USA},
      year       = {2009},
      editor     = {Manuel Chakravarty and Leaf Peterson},
      pages      = {59--67},
      publisher  = {ACM Press},
      abstract   = {With the rising variety of hardware designs for multi-core systems, the effectiveness in exploiting implicit concurrency of programs plays a more vital role for programming such systems than ever before. We believe that a combination of a data-parallel approach with a declarative programming-style is up to that task: Data-parallel approaches are known to enable compilers to make efficient use of multi-processors without requiring low-level program annotations. Combining the data-parallel approach with a declarative programming-style guarantees semantic equivalence between sequential and concurrent executions of data parallel operations. Furthermore, the side-effect free setting and explicit model of dependencies enables compilers to maximise the size of the data-parallel program sections. However, the strength of the rigidity of the declarative approach also constitutes its weakness: Being bound to observe all data dependencies categorically rules out the use of side-effecting operations within data-parallel sections. Not only does this limit the size of these regions in certain situations, but it may also hamper an effective workload distribution. Considering side effects such as plotting individual pixels of an image or output for debugging purposes, there are situations where a non-deterministic order of side-effects would not be considered harmful at all. We propose a mechanism for enabling such non-determinism on the execution of side-effecting operations within data-parallel sections without sacrificing the side-effect free setting in general. Outside of the data-parallel sections we ensure single-threading of side-effecting operations using uniqueness typing. Within data-parallel operations however we allow the side-effecting operations of different threads to occur in any order, as long as effects of different threads are not interleaved. Furthermore, we still model the dependencies arising from the manipulated states within the data parallel sections. This measure preserves the explicitness of all data dependencies and therefore it preserves the transformational potential of any restructuring compiler.},
      affil      = {ctca},
      category   = {par,states},
      doi        = {10.1145/1481839.1481847},
      isbn       = {978-1-60558-417-1},
      pubaddress = {New York, NY, USA},
      topics     = {SAC},
      url        = {2009_5.pdf},
    }
    With the rising variety of hardware designs for multi-core systems, the effectiveness in exploiting implicit concurrency of programs plays a more vital role for programming such systems than ever before. We believe that a combination of a data-parallel approach with a declarative programming-style is up to that task: Data-parallel approaches are known to enable compilers to make efficient use of multi-processors without requiring low-level program annotations. Combining the data-parallel approach with a declarative programming-style guarantees semantic equivalence between sequential and concurrent executions of data parallel operations. Furthermore, the side-effect free setting and explicit model of dependencies enables compilers to maximise the size of the data-parallel program sections. However, the strength of the rigidity of the declarative approach also constitutes its weakness: Being bound to observe all data dependencies categorically rules out the use of side-effecting operations within data-parallel sections. Not only does this limit the size of these regions in certain situations, but it may also hamper an effective workload distribution. Considering side effects such as plotting individual pixels of an image or output for debugging purposes, there are situations where a non-deterministic order of side-effects would not be considered harmful at all. We propose a mechanism for enabling such non-determinism on the execution of side-effecting operations within data-parallel sections without sacrificing the side-effect free setting in general. Outside of the data-parallel sections we ensure single-threading of side-effecting operations using uniqueness typing. Within data-parallel operations however we allow the side-effecting operations of different threads to occur in any order, as long as effects of different threads are not interleaved. Furthermore, we still model the dependencies arising from the manipulated states within the data parallel sections. This measure preserves the explicitness of all data dependencies and therefore it preserves the transformational potential of any restructuring compiler.
  • Controlling Chaos — on Safe Side-effects in Data-parallel Operations Stephan Herhut, Sven-Bodo Scholz, Clemens Grelck (2009). ACM SIGPLAN Notices 44 (5) pp. 9–10. 2009_6.pdf BibTexAbstract
    @Article{HerhSchoGrelSPN09,
      author   = {Stephan Herhut and Sven-Bodo Scholz and Clemens Grelck},
      title    = {Controlling Chaos --- on Safe Side-effects in Data-parallel Operations},
      journal  = {ACM SIGPLAN Notices},
      year     = {2009},
      volume   = {44},
      number   = {5},
      pages    = {9--10},
      abstract = {With the rising variety of hardware designs for multi-core systems, the effectiveness in exploiting implicit concurrency of programs plays a more vital role for programming such systems than ever before. We believe that a combination of a data-parallel approach with a declarative programming-style is up to that task: Data-parallel approaches are known to enable compilers to make efficient use of multi-processors without requiring low-level program annotations. Combining the data-parallel approach with a declarative programming-style guarantees semantic equivalence between sequential and concurrent executions of data parallel operations. Furthermore, the side-effect free setting and explicit model of dependencies enables compilers to maximise the size of the data-parallel program sections. However, the strength of the rigidity of the declarative approach also constitutes its weakness: Being bound to observe all data dependencies categorically rules out the use of side-effecting operations within data-parallel sections. Not only does this limit the size of these regions in certain situations, but it may also hamper an effective workload distribution. Considering side effects such as plotting individual pixels of an image or output for debugging purposes, there are situations where a non-deterministic order of side-effects would not be considered harmful at all. We propose a mechanism for enabling such non-determinism on the execution of side-effecting operations within data-parallel sections without sacrificing the side-effect free setting in general. Outside of the data-parallel sections we ensure single-threading of side-effecting operations using uniqueness typing. Within data-parallel operations however we allow the side-effecting operations of different threads to occur in any order, as long as effects of different threads are not interleaved. Furthermore, we still model the dependencies arising from the manipulated states within the data parallel sections. This measure preserves the explicitness of all data dependencies and therefore it preserves the transformational potential of any restructuring compiler.},
      affil    = {ctca},
      category = {par,states},
      issn     = {0362-1340},
      topics   = {SAC},
      url      = {2009_6.pdf},
    }
    With the rising variety of hardware designs for multi-core systems, the effectiveness in exploiting implicit concurrency of programs plays a more vital role for programming such systems than ever before. We believe that a combination of a data-parallel approach with a declarative programming-style is up to that task: Data-parallel approaches are known to enable compilers to make efficient use of multi-processors without requiring low-level program annotations. Combining the data-parallel approach with a declarative programming-style guarantees semantic equivalence between sequential and concurrent executions of data parallel operations. Furthermore, the side-effect free setting and explicit model of dependencies enables compilers to maximise the size of the data-parallel program sections. However, the strength of the rigidity of the declarative approach also constitutes its weakness: Being bound to observe all data dependencies categorically rules out the use of side-effecting operations within data-parallel sections. Not only does this limit the size of these regions in certain situations, but it may also hamper an effective workload distribution. Considering side effects such as plotting individual pixels of an image or output for debugging purposes, there are situations where a non-deterministic order of side-effects would not be considered harmful at all. We propose a mechanism for enabling such non-determinism on the execution of side-effecting operations within data-parallel sections without sacrificing the side-effect free setting in general. Outside of the data-parallel sections we ensure single-threading of side-effecting operations using uniqueness typing. Within data-parallel operations however we allow the side-effecting operations of different threads to occur in any order, as long as effects of different threads are not interleaved. Furthermore, we still model the dependencies arising from the manipulated states within the data parallel sections. This measure preserves the explicitness of all data dependencies and therefore it preserves the transformational potential of any restructuring compiler.

2022

  • In-Place-Folding of Non-Scalar Hyper-Planes of Multi-Dimensional Arrays Gijs van Cuyck, Sven-Bodo Scholz (2022). In 33rd Symposium on Implementation and Application of Functional Languages. pp. 29–41. Association for Computing Machinery. New York, NY, USA. DOI cuyckscholzifl21.pdf BibTexAbstract
    @inproceedings{CuycSchoIFL21,
     author = {van Cuyck, Gijs and Scholz, Sven-Bodo},
     title = {In-Place-Folding of Non-Scalar Hyper-Planes of Multi-Dimensional Arrays},
     year = {2022},
     isbn = {9781450386449},
     publisher = {Association for Computing Machinery},
     address = {New York, NY, USA},
     url = {CuyckScholzIFL21.pdf},
     doi = {10.1145/3544885.3544893},
     summary = {Doing non-scalar folds in place.},
     category = {core, opt},
     abstract = {Memory management plays a key role when trying to compile functional programs into efficiently executable code. In particular when using flat representations for multi-dimensional arrays, i.e., when using a single memory block for the entire data of a multi-dimensional array, in-place updates become crucial for highly competitive performance. This paper proposes a novel code generation technique for performing fold-operations on hyper-planes of multi-dimensional arrays, where the fold-operation itself operates on non-scalar sub-arrays, i.e., on vectors or higher-dimensional arrays. This technique allows for a single result array allocation over the entire folding operation without requiring the folding operation itself to be scalarised. It enables the utilisation of vector operations without any added memory allocation or copying overhead. We describe our technique in the context of SaC, sketch our implementation in the context of the compiler sac2c and provide some initial performance measurements that give an indication of the effectiveness of this new technique.},
     booktitle = {33rd Symposium on Implementation and Application of Functional Languages},
     pages = {29–41},
     numpages = {13},
     keywords = {reference counting, compiler optimisation, memory management, SaC},
     location = {Nijmegen, Netherlands},
     series = {IFL '21} 
    }
    Memory management plays a key role when trying to compile functional programs into efficiently executable code. In particular when using flat representations for multi-dimensional arrays, i.e., when using a single memory block for the entire data of a multi-dimensional array, in-place updates become crucial for highly competitive performance. This paper proposes a novel code generation technique for performing fold-operations on hyper-planes of multi-dimensional arrays, where the fold-operation itself operates on non-scalar sub-arrays, i.e., on vectors or higher-dimensional arrays. This technique allows for a single result array allocation over the entire folding operation without requiring the folding operation itself to be scalarised. It enables the utilisation of vector operations without any added memory allocation or copying overhead. We describe our technique in the context of SaC, sketch our implementation in the context of the compiler sac2c and provide some initial performance measurements that give an indication of the effectiveness of this new technique.
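    A fold of the kind the paper targets, sketched in sac2c syntax (our example, not the paper's): the fold operation adds entire rows, i.e. vectors, and the proposed code generation accumulates them in one result buffer instead of scalarising the addition.

        use Array: all;

        /* row-wise sum of a matrix: the index vector iv has length 1,
           so a[iv] selects a whole row; the neutral element is a
           zero vector of matching length */
        int[.] rowsum( int[.,.] a )
        {
          return with {
                   ( [0] <= iv < [shape(a)[0]] ) : a[iv];
                 } : fold( +, genarray( [shape(a)[1]], 0 ) );
        }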

2018

  • Extended Memory Reuse: An Optimisation for Reducing Memory Allocations Hans-Nikolai Vießmann, Artjoms Šinkarovs, Sven-Bodo Scholz (2018). In Proceedings of the 30th Symposium on Implementation and Application of Functional Languages. pp. 107–118. Association for Computing Machinery. New York, NY, USA. DOI extended-memory-reuse-ifl18.pdf BibTexAbstract
    @inproceedings{ViesSinkSchoIFL18,
    author = {Vie\ss{}mann, Hans-Nikolai and \v{S}inkarovs, Artjoms and Scholz, Sven-Bodo},
    title = {Extended Memory Reuse: An Optimisation for Reducing Memory Allocations},
    year = {2018},
    isbn = {9781450371438},
    publisher = {Association for Computing Machinery},
    address = {New York, NY, USA},
    doi = {10.1145/3310232.3310242},
    summary = {Explains all details of the optimisation EMR.},
    url = {extended-memory-reuse-IFL18.pdf},
    abstract = {In this paper we present an optimisation for reference counting based garbage collection. The optimisation aims at reducing the total number of calls to the heap manager while preserving the key benefits of reference counting, i.e. the opportunities for in-place updates as well as memory deallocation without global garbage collection. The key idea is to carefully extend the lifetime of variables so that memory deallocations followed by memory allocations of the same size can be replaced by a direct memory reuse. Such memory reuse turns out particularly useful in the context of innermost loops of compute-intensive applications. It leads to a runtime behaviour that performs pointer swaps between buffers in the same way it would be implemented manually in languages that require explicit memory management, e.g. C. We have implemented the proposed optimisation in the context of the Single-Assignment C compiler tool chain. The paper provides an algorithmic description of our optimisation and an evaluation of its effectiveness over a collection of benchmarks including a subset of the Rodinia benchmarks and the NAS Parallel Benchmarks. We show that for several benchmarks with allocations within loops our optimisation reduces the amount of allocations by a few orders of magnitude. We also observe no negative impact on the overall memory footprint nor on the overall runtime. Instead, for some sequential executions we find mild improvement, and on GPU devices we observe speedups of up to a factor of 4x.},
    booktitle = {Proceedings of the 30th Symposium on Implementation and Application of Functional Languages},
    pages = {107–118},
    numpages = {12},
    category = {core,opt},
    keywords = {reference counting, memory management, compiler optimisation},
    location = {Lowell, MA, USA},
    series = {IFL 2018}
    }
    In this paper we present an optimisation for reference counting based garbage collection. The optimisation aims at reducing the total number of calls to the heap manager while preserving the key benefits of reference counting, i.e. the opportunities for in-place updates as well as memory deallocation without global garbage collection. The key idea is to carefully extend the lifetime of variables so that memory deallocations followed by memory allocations of the same size can be replaced by a direct memory reuse. Such memory reuse turns out particularly useful in the context of innermost loops of compute-intensive applications. It leads to a runtime behaviour that performs pointer swaps between buffers in the same way it would be implemented manually in languages that require explicit memory management, e.g. C. We have implemented the proposed optimisation in the context of the Single-Assignment C compiler tool chain. The paper provides an algorithmic description of our optimisation and an evaluation of its effectiveness over a collection of benchmarks including a subset of the Rodinia benchmarks and the NAS Parallel Benchmarks. We show that for several benchmarks with allocations within loops our optimisation reduces the amount of allocations by a few orders of magnitude. We also observe no negative impact on the overall memory footprint nor on the overall runtime. Instead, for some sequential executions we find mild improvement, and on GPU devices we observe speedups of up to a factor of 4x.
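    The buffer-swap behaviour shows up in typical iteration loops; a sketch of ours, where relax stands for any shape-preserving function defined by a with-loop:

        use Array: all;

        /* three-point average on the inner elements, boundaries kept */
        double[.] relax( double[.] a )
        {
          return with {
                   ( [1] <= iv < [shape(a)[0] - 1] )
                     : (a[iv - 1] + a[iv] + a[iv + 1]) / 3.0;
                 } : modarray( a );
        }

        double[.] iterate( double[.] a, int n )
        {
          /* without EMR, each call conceptually frees the old array and
             allocates a fresh one of the same size; EMR extends the old
             buffer's lifetime so that the free/alloc pair becomes a
             direct reuse, i.e. a pointer swap between two buffers */
          for (i = 0; i < n; i++) {
            a = relax( a );
          }
          return a;
        }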

2015

  • Type-driven Data Layouts for Improved Vectorisation Artjoms Šinkarovs, Sven-Bodo Scholz (May 2015). Concurrency and Computation: Practice and Experience 28 (7) pp. 2092–2119. Wiley-Blackwell. DOI BibTexAbstract
    @Article{SinkSchoCCPE15,
      author    = {Šinkarovs, Artjoms and Scholz, Sven-Bodo},
      title     = {Type-driven Data Layouts for Improved Vectorisation},
      journal   = {Concurrency and Computation: Practice and Experience},
      year      = {2015},
      volume    = {28},
      number    = {7},
      pages     = {2092--2119},
      month     = {May},
      abstract  = {Vector instructions of modern CPUs are crucially important for the performance of compute-intensive algorithms. Auto-vectorisation often fails because of an unfortunate choice of data layout by the programmer. This paper proposes a data layout inference for auto-vectorisation that identifies layout transformations that convert single instruction, multiple data-unfavourable layouts of data structures into favourable ones. We present a type system for layout transformations, and we sketch an inference algorithm for it. Finally, we present some initial performance figures for the impact of the inferred layout transformations. They show that non-intuitive layouts that are inferred through our system can have a vast performance impact on compute intensive programs.},
      category  = {types,opt,par},
      doi       = {10.1002/cpe.3501},
      issn      = {1532-0626},
      keywords  = {Data parallelism, Hpc, Vectorisation},
      publisher = {Wiley-Blackwell},
    }
    Vector instructions of modern CPUs are crucially important for the performance of compute-intensive algorithms. Auto-vectorisation often fails because of an unfortunate choice of data layout by the programmer. This paper proposes a data layout inference for auto-vectorisation that identifies layout transformations that convert single instruction, multiple data-unfavourable layouts of data structures into favourable ones. We present a type system for layout transformations, and we sketch an inference algorithm for it. Finally, we present some initial performance figures for the impact of the inferred layout transformations. They show that non-intuitive layouts that are inferred through our system can have a vast performance impact on compute intensive programs.
  • Dynamic Adaptation of Functional Runtime Systems Through External Control Stuart Gordon, Sven-Bodo Scholz (2015). In Proceedings of the 27th Symposium on the Implementation and Application of Functional Programming Languages. pp. 10:1–10:13. ACM. New York, NY, USA. DOI BibTexAbstract
    @InProceedings{GordonScholzIFL15,
     author = {Gordon, Stuart and Scholz, Sven-Bodo},
     title = {Dynamic Adaptation of Functional Runtime Systems Through External Control},
     booktitle = {Proceedings of the 27th Symposium on the Implementation and Application of Functional Programming Languages},
     series = {IFL '15},
     year = {2015},
     isbn = {978-1-4503-4273-5},
     location = {Koblenz, Germany},
     pages = {10:1--10:13},
     articleno = {10},
     numpages = {13},
     doi = {10.1145/2897336.2897347},
     acmid = {2897347},
     publisher = {ACM},
     address = {New York, NY, USA},
     keywords = {SAC, array programming, compilers, dynamic adaptation, functional programming, high performance computing, high-level parallel programming, over-subscription},
     category = {core, opt},
     abstract = {In this paper, we present a novel approach towards providing compiler generated runtime means for dynamic adaptation. The key novelty of the proposed solution is a complete separation between the runtime adaptation mechanism itself and the control mechanism of the adaptation. This strict separation has various benefits including experimentation with adaptation control mechanisms without modifying the runtime system itself, and opening up the possibility for control mechanisms that control several applications possibly even across different runtime systems. The paper does not only present the basic principles of the approach taken, it also describes our prototypical implementation in the context of the functional array programming language SaC. The dynamic adaptation we are concerned with in the context of this paper is an adaptation in the number of computing cores utilised during parallel execution on multiple cores. We show how this adaptation mechanism can be leveraged to deal with changes in the background load generated from other applications running concurrently with the application whose parallelism we want to control.},
     summary = {This paper describes the first cut on providing a runtime interface for continuous external adaptation of SaC programs. It also demonstrates how this can be used to adapt the number of threads used within a with-loop solely based on previous execution times of the same with-loop.},
    }
    In this paper, we present a novel approach towards providing compiler generated runtime means for dynamic adaptation. The key novelty of the proposed solution is a complete separation between the runtime adaptation mechanism itself and the control mechanism of the adaptation. This strict separation has various benefits including experimentation with adaptation control mechanisms without modifying the runtime system itself, and opening up the possibility for control mechanisms that control several applications possibly even across different runtime systems. The paper does not only present the basic principles of the approach taken, it also describes our prototypical implementation in the context of the functional array programming language SaC. The dynamic adaptation we are concerned with in the context of this paper is an adaptation in the number of computing cores utilised during parallel execution on multiple cores. We show how this adaptation mechanism can be leveraged to deal with changes in the background load generated from other applications running concurrently with the application whose parallelism we want to control.

2014

  • Polyhedral Methods for Improving Parallel Update-in-Place Jing Guo, Robert Bernecky, Jeyarajan Thiyagalingam, Sven-Bodo Scholz (Jan 2014). In Proceedings of the 4th International Workshop on Polyhedral Compilation Techniques. Vienna, Austria. impact2014-guo.pdf BibTexAbstract
    @inproceedings{GuoBernThiySchoIMPACT14,
      author = {Jing Guo and Robert Bernecky and Jeyarajan Thiyagalingam and Sven-Bodo Scholz},
      title = {Polyhedral Methods for Improving Parallel Update-in-Place},
      abstract = {We demonstrate an optimization, denoted as polyhedral reuse analysis (PRA), that uses polyhedral methods to improve the analysis of in-place update for single-assignment arrays. The PRA optimization attempts to determine when parallel array operations that jointly define new arrays from existing ones can reuse the memory of the existing arrays, rather than creating new ones. Polyhedral representations and related dependency inference methods facilitate that analysis. In the context of SaC, we demonstrate the impact of this optimisation using two non-trivial benchmarks evaluated on conventional shared memory machines and on GPUs, obtaining performance improvements of 2–8 times for LU Decomposition and of 2–10 times for Needleman-Wunsch, over the same computations with PRA disabled.},
      summary = {Describes polyhedral reuse analysis.},
      booktitle = {Proceedings of the 4th International Workshop on Polyhedral Compilation Techniques},
      editor = {Rajopadhye, Sanjay and Verdoolaege, Sven},
      year   = 2014,
      month  = Jan,
      address = {Vienna, Austria},
      url = {impact2014-guo.pdf},
      category = {opt}
    }
    We demonstrate an optimization, denoted as polyhedral reuse analysis (PRA), that uses polyhedral methods to improve the analysis of in-place update for single-assignment arrays. The PRA optimization attempts to determine when parallel array operations that jointly define new arrays from existing ones can reuse the memory of the existing arrays, rather than creating new ones. Polyhedral representations and related dependency inference methods facilitate that analysis. In the context of SaC, we demonstrate the impact of this optimisation using two non-trivial benchmarks evaluated on conventional shared memory machines and on GPUs, obtaining performance improvements of 2–8 times for LU Decomposition and of 2–10 times for Needleman-Wunsch, over the same computations with PRA disabled.

2012

  • Asynchronous Adaptive Optimisation for Generic Data-parallel Array Programming Clemens Grelck, Tim van Deurzen, Stephan Herhut, Sven-Bodo Scholz (2012). Concurrency and Computation: Practice and Experience 24 (5) pp. 499–516. DOI2012_2.pdf BibTexAbstract
    Programming productivity very much depends on the availability of basic building blocks that can be reused for a wide range of application scenarios and the ability to define rich abstraction hierarchies. Driven by the aim for increased reuse, such basic building blocks tend to become more and more generic in their specification; structural as well as behavioural properties are turned into parameters that are passed on to lower layers of abstraction where eventually a differentiation is being made. In the context of array programming, such properties are typically array ranks (number of axes/dimensions) and array shapes (number of elements along each axis/dimension). This allows for abstract definitions of operations such as element-wise additions, concatenations, rotations, and so on, which jointly enable a very high-level compositional style of programming, similar to, for instance, MATLAB. However, such a generic programming style generally comes at a price in terms of runtime overheads when compared against tailor-made low-level implementations. Additional layers of abstraction as well as the lack of hard-coded structural properties often inhibit optimisations that are obvious otherwise. Although complex static compiler analyses and transformations such as partial evaluations can ameliorate the situation to quite some extent, there are cases where the required level of information is not available until runtime. In this paper, we propose to shift part of the optimisation process into the runtime of applications. Triggered by some runtime observation, the compiler asynchronously applies partial evaluation techniques to frequently used program parts and dynamically replaces initial program fragments by more specialised ones through dynamic re-linking. In contrast to many existing approaches, we suggest this optimisation to be done in a rather non-intrusive, decoupled way. We use a full-fledged compiler that is run on a separate core. This measure enables us to run the compiler on its highest optimisation level, which requires non-negligible compilation times for our optimisations. We use the compiler's type system to identify the potential dynamic optimisations. And we use the host language's module system as a facilitator for the dynamic code modifications. We present the architecture and implementation of an adaptive compilation framework for Single Assignment C, a data-parallel array programming language. Single Assignment C advocates shape-generic and rank-generic programming with arrays. A sophisticated, highly optimising compiler technology nevertheless achieves competitive runtime performance. We demonstrate the suitability of our approach to achieve consistently high performance independent of the static availability of array properties by means of several experiments based on a highly generic formulation of rank-invariant convolution as a case study. Copyright © 2011 John Wiley & Sons, Ltd.
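    The case study is a highly generic, rank-invariant convolution. A minimal sketch of what such a specification can look like (our illustration, not code from the paper; it assumes the stdlib Array module and a rotate( axis, offset, array) argument order):

    use Array: all;

    /* Rank-generic nearest-neighbour smoothing: the same code works for
     * vectors, matrices and higher-rank arrays.  It is this kind of
     * shape- and rank-generic function that the adaptive framework
     * specialises at runtime once concrete array properties are known. */
    double[*] smooth( double[*] a)
    {
      res = a;
      for (axis = 0; axis < dim( a); axis++) {
        res = (rotate( axis, 1, res) + res + rotate( axis, -1, res)) / 3.0;
      }
      return res;
    }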

2011

  • Breaking the Gpu Programming Barrier with the Auto-parallelising Sac Compiler Jing Guo, Jeyarajan Thiyagalingam, Sven-Bodo Scholz (2011). In 6th Workshop on Declarative Aspects of Multicore Programming (DAMP'11), Austin, USA. pp. 15–24. ACM Press. DOI2011_5.pdf BibTexAbstract
    @InProceedings{GuoJeyaSchoDAMP11,
      author     = {Jing Guo and Jeyarajan Thiyagalingam and Sven-Bodo Scholz},
      title      = {Breaking the Gpu Programming Barrier with the Auto-parallelising Sac Compiler},
      booktitle  = {6th Workshop on Declarative Aspects of Multicore Programming (DAMP'11), Austin, USA},
      year       = {2011},
      pages      = {15--24},
      publisher  = {ACM Press},
      abstract   = {Over recent years, the use of Graphics Processing Units (GPUs) for general-purpose computing has become increasingly popular. The main reasons for this development are the attractive performance/price and performance/power ratios of these architectures. However, substantial performance gains from GPUs come at a price: they require extensive programming expertise and, typically, a substantial re-coding effort. Although the programming experience has been significantly improved by existing frameworks like CUDA and OpenCL, it is still a challenge to effectively utilise these devices. Directive-based approaches such as hiCUDA or OpenMP-variants offer further improvements but have not eliminated the need for the expertise on these complex architectures. Similarly, special purpose programming languages such as Microsoft's Accelerator try to lower the barrier further. They provide the programmer with a special form of GPU data structures and operations on them which are then compiled into GPU code. In this paper, we take this trend towards a completely implicit, high-level approach yet another step further. We generate CUDA code from a MATLAB-like high level functional array programming language, Single Assignment C (SaC). To do so, we identify which data structures and operations can be successfully mapped on GPUs and transform existing programs accordingly. This paper presents the first runtime results from our GPU backend and it presents the basic set of GPU-specific program optimisations that turned out to be essential. Despite our high-level program specifications, we show that for a number of benchmarks speedups between a factor of 5 and 50 can be achieved through our parallelising compiler.},
      affil      = {ctca},
      category   = {par,core,opt},
      doi        = {10.1145/1926354.1926359},
      pubaddress = {New York, NY, USA},
      url        = {2011_5.pdf},
    }
    Over recent years, the use of Graphics Processing Units (GPUs) for general-purpose computing has become increasingly popular. The main reasons for this development are the attractive performance/price and performance/power ratios of these architectures. However, substantial performance gains from GPUs come at a price: they require extensive programming expertise and, typically, a substantial re-coding effort. Although the programming experience has been significantly improved by existing frameworks like CUDA and OpenCL, it is still a challenge to effectively utilise these devices. Directive-based approaches such as hiCUDA or OpenMP-variants offer further improvements but have not eliminated the need for the expertise on these complex architectures. Similarly, special purpose programming languages such as Microsoft's Accelerator try to lower the barrier further. They provide the programmer with a special form of GPU data structures and operations on them which are then compiled into GPU code. In this paper, we take this trend towards a completely implicit, high-level approach yet another step further. We generate CUDA code from a MATLAB-like high level functional array programming language, Single Assignment C (SaC). To do so, we identify which data structures and operations can be successfully mapped on GPUs and transform existing programs accordingly. This paper presents the first runtime results from our GPU backend and it presents the basic set of GPU-specific program optimisations that turned out to be essential. Despite our high-level program specifications, we show that for a number of benchmarks speedups between a factor of 5 and 50 can be achieved through our parallelising compiler.
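    The pattern the backend targets is the side-effect free, data-parallel with-loop; a sketch (ours, not from the paper):

    use Array: all;

    /* A shape-generic element-wise computation.  The CUDA backend can
     * map a with-loop like this to a GPU kernel, one thread per index
     * iv, with host/device memory transfers inserted by the compiler. */
    float[.,.] saxpy( float alpha, float[.,.] x, float[.,.] y)
    {
      return with {
               ( 0*shape(x) <= iv < shape(x)) : alpha * x[iv] + y[iv];
             } : genarray( shape(x), 0.0f);
    }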

2010

  • Compilation of Modelica Array Computations into Single Assignment C for Efficient Execution on Cuda-enabled Gpu Kristian Stavåker, Daniel Rolls, Jing Guo, Peter Fritzson, Sven-Bodo Scholz (oct 2010). In 3rd International Workshop on Equation-Based Object-Oriented Languages and Tools, Oslo, Norway. 2010_2.pdf BibTexAbstract
    @InProceedings{RollSchoJoslStavetalEOOLT10,
      author    = {Kristian Stavåker and Daniel Rolls and Jing Guo and Peter Fritzson and Sven-Bodo Scholz},
      title     = {Compilation of Modelica Array Computations into Single Assignment C for Efficient Execution on Cuda-enabled Gpu},
      booktitle = {3rd International Workshop on Equation-Based Object-Oriented Languages and Tools, Oslo, Norway},
      year      = {2010},
      month     = oct,
      abstract  = {Mathematical models, derived for example from discretisation of partial differential equations, often contain operations over large arrays. In this work we investigate the possibility of compiling array operations from models in the equation-based language Modelica into Single Assignment C (SAC). The SAC2C SAC compiler can generate highly efficient code that, for instance, can be executed on CUDA-enabled GPUs. We plan to enhance the open-source Modelica compiler OpenModelica with capabilities to detect and compile data parallel Modelica for-equations/array-equations into SAC WITH-loops. As a first step we demonstrate the feasibility of this approach by manually inserting calls to SAC array operations in the code generated from OpenModelica and show how capabilities and runtimes can be extended. As a second step we demonstrate the feasibility of rewriting parts of the OpenModelica simulation runtime system in SAC. Finally, we discuss SAC2C's switchable target architectures and demonstrate one by harnessing a CUDA-enabled GPU to improve runtimes. To the best of our knowledge, compilation of Modelica array operations for execution on CUDA-enabled GPUs is a new research area.},
      affil     = {ctca},
      category  = {core,opt},
      topics    = {SAC, cuda, Modelica, Openmodelica},
      url       = {2010_2.pdf},
    }
    Mathematical models, derived for example from discretisation of partial differential equations, often contain operations over large arrays. In this work we investigate the possibility of compiling array operations from models in the equation-based language Modelica into Single Assignment C (SAC). The SAC2C SAC compiler can generate highly efficient code that, for instance, can be executed on CUDA-enabled GPUs. We plan to enhance the open-source Modelica compiler OpenModelica with capabilities to detect and compile data parallel Modelica for-equations/array-equations into SAC WITH-loops. As a first step we demonstrate the feasibility of this approach by manually inserting calls to SAC array operations in the code generated from OpenModelica and show how capabilities and runtimes can be extended. As a second step we demonstrate the feasibility of rewriting parts of the OpenModelica simulation runtime system in SAC. Finally, we discuss SAC2C's switchable target architectures and demonstrate one by harnessing a CUDA-enabled GPU to improve runtimes. To the best of our knowledge, compilation of Modelica array operations for execution on CUDA-enabled GPUs is a new research area.
  • Symbiotic Expressions Robert Bernecky, Stephan Herhut, Sven-Bodo Scholz (2010). In Implementation and Application of Functional Languages, 21st International Symposium, IFL 2009, South Orange, NJ, USA. pp. 107–126. Springer. DOI2011_7.pdf BibTexAbstract
    @InProceedings{BernHerhSchoIFL09,
      author    = {Robert Bernecky and Stephan Herhut and Sven-Bodo Scholz},
      title     = {Symbiotic Expressions},
      booktitle = {Implementation and Application of Functional Languages, 21st International Symposium, IFL 2009, South Orange, NJ, USA},
      year      = {2010},
      editor    = {Marco T. Morazan and Sven-Bodo Scholz},
      number    = {6041},
      series    = {Lecture Notes in Computer Science},
      pages     = {107--126},
      publisher = {Springer},
      affil     = {ctca},
      summary   = {},
      abstract  = {We introduce symbiotic expressions, a method for algebraic simplification within a compiler, in lieu of an SMT solver, such as Yices or the Omega Calculator. Symbiotic expressions are compiler-generated expressions, temporarily injected into a program's abstract syntax tree (AST). The compiler's normal optimizations interpret and simplify those expressions, making their results available for the compiler to use as a basis for decisions about further optimization of the source program. The expressions are symbiotic, in the sense that both parties benefit: an optimization benefits, by using the compiler itself to simplify expressions that have been attached, lamprey-like, to the AST by the optimization; the program being compiled benefits, from improved run-time in both serial and parallel environments. We show the utility of symbiotic expressions by using them to extend the SAC compiler's With-Loop-Folding optimization, currently limited to Arrays of Known Shape (AKS), to Arrays of Known Dimensionality (AKD). We show that, in conjunction with array-based constant-folding, injection and propagation of array extrema, and compiler-based expression simplification, symbiotic expressions are an effective tool for implementing advanced array optimizations. Symbiotic expressions are also simpler and more likely to be correct than hard-coded analysis, and are flexible and relatively easy to use. Finally, symbiotic expressions are synergistic: they take immediate advantage of new or improved optimizations in the compiler. Symbiotic expressions are a useful addition to a compiler writer's toolkit, giving the compiler a restricted subset of the analysis power of an SMT solver.},
      category  = {opt},
      DOI       = {10.1007/978-3-642-16478-1_7},
      URL       = {2011_7.pdf},
    }
    We introduce symbiotic expressions, a method for algebraic simplification within a compiler, in lieu of an SMT solver, such as Yices or the Omega Calculator. Symbiotic expressions are compiler-generated expressions, temporarily injected into a program's abstract syntax tree (AST). The compiler's normal optimizations interpret and simplify those expressions, making their results available for the compiler to use as a basis for decisions about further optimization of the source program. The expressions are symbiotic, in the sense that both parties benefit: an optimization benefits, by using the compiler itself to simplify expressions that have been attached, lamprey-like, to the AST by the optimization; the program being compiled benefits, from improved run-time in both serial and parallel environments. We show the utility of symbiotic expressions by using them to extend the SAC compiler's With-Loop-Folding optimization, currently limited to Arrays of Known Shape (AKS), to Arrays of Known Dimensionality (AKD). We show that, in conjunction with array-based constant-folding, injection and propagation of array extrema, and compiler-based expression simplification, symbiotic expressions are an effective tool for implementing advanced array optimizations. Symbiotic expressions are also simpler and more likely to be correct than hard-coded analysis, and are flexible and relatively easy to use. Finally, symbiotic expressions are synergistic: they take immediate advantage of new or improved optimizations in the compiler. Symbiotic expressions are a useful addition to a compiler writer's toolkit, giving the compiler a restricted subset of the analysis power of an SMT solver.

2009

  • Symbiotic Expressions Robert Bernecky, Stephan Herhut, Sven-Bodo Scholz (2009). In 21st Symposium on Implementation and Application of Functional Languages (IFL'09), South Orange, NJ, USA. Seton Hall University, South Orange, NJ, USA.. 2009_1.pdf BibTexAbstract
    @InProceedings{BernHerhSchoIFL09draft,
      author    = {Robert Bernecky and Stephan Herhut and Sven-Bodo Scholz},
      title     = {Symbiotic Expressions},
      booktitle = {21st Symposium on Implementation and Application of Functional Languages (IFL'09), South Orange, NJ, USA},
      year      = {2009},
      number    = {{SHU-TR-CS-2009-09-1}},
      publisher = {Seton Hall University, South Orange, NJ, USA.},
      affil     = {ctca},
      summary   = {},
      abstract  = {We introduce symbiotic expressions, a method for algebraic simplification within a compiler, in lieu of an SMT solver, such as Yices or the Omega Calculator. Symbiotic expressions are compiler-generated expressions, temporarily injected into a program's abstract syntax tree (AST). The compiler's normal optimizations interpret and simplify those expressions, making their results available for the compiler to use as a basis for decisions about further optimization of the source program. The expressions are symbiotic, in the sense that both parties benefit: an optimization benefits, by using the compiler itself to simplify expressions that have been attached, lamprey-like, to the AST by the optimization; the program being compiled benefits, from improved run-time in both serial and parallel environments. We show the utility of symbiotic expressions by using them to extend the SAC compiler's With-Loop-Folding optimization, currently limited to Arrays of Known Shape (AKS), to Arrays of Known Dimensionality (AKD). We show that, in conjunction with array-based constant-folding, injection and propagation of array extrema, and compiler-based expression simplification, symbiotic expressions are an effective tool for implementing advanced array optimizations. Symbiotic expressions are also simpler and more likely to be correct than hard-coded analysis, and are flexible and relatively easy to use. Finally, symbiotic expressions are synergistic: they take immediate advantage of new or improved optimizations in the compiler. Symbiotic expressions are a useful addition to a compiler writer's toolkit, giving the compiler a restricted subset of the analysis power of an SMT solver.},
      category  = {opt},
      DOI       = {},
      URL       = {2009_1.pdf},
    }
    We introduce symbiotic expressions, a method for algebraic simplification within a compiler, in lieu of an SMT solver, such as Yices or the Omega Calculator. Symbiotic expressions are compiler-generated expressions, temporarily injected into a program's abstract syntax tree (AST). The compiler's normal optimizations interpret and simplify those expressions, making their results available for the compiler to use as a basis for decisions about further optimization of the source program. The expressions are symbiotic, in the sense that both parties benefit: an optimization benefits, by using the compiler itself to simplify expressions that have been attached, lamprey-like, to the AST by the optimization; the program being compiled benefits, from improved run-time in both serial and parallel environments. We show the utility of symbiotic expressions by using them to extend the SAC compiler's With-Loop-Folding optimization, currently limited to Arrays of Known Shape (AKS), to Arrays of Known Dimensionality (AKD). We show that, in conjunction with array-based constant-folding, injection and propagation of array extrema, and compiler-based expression simplification, symbiotic expressions are an effective tool for implementing advanced array optimizations. Symbiotic expressions are also simpler and more likely to be correct than hard-coded analysis, and are flexible and relatively easy to use. Finally, symbiotic expressions are synergistic: they take immediate advantage of new or improved optimizations in the compiler. Symbiotic expressions are a useful addition to a compiler writer's toolkit, giving the compiler a restricted subset of the analysis power of an SMT solver.

2008

  • From Contracts Towards Dependent Types: Proofs by Partial Evaluation Stephan Herhut, Sven-Bodo Scholz, Robert Bernecky, Clemens Grelck, Kai Trojahner (2008). In 19th International Symposium on Implementation and Application of Functional Languages (IFL'07), Freiburg, Germany, Revised Selected Papers. pp. 254–273. Springer. DOIfctdt.pdf BibTexAbstract
    @InProceedings{HerhSchoBernetalIFL07,
      author     = {Stephan Herhut and Sven-Bodo Scholz and Robert Bernecky and Clemens Grelck and Kai Trojahner},
      title      = {From Contracts Towards Dependent Types: Proofs by Partial Evaluation},
      booktitle  = {19th International Symposium on Implementation and Application of Functional Languages (IFL'07), Freiburg, Germany, Revised Selected Papers},
      year       = {2008},
      editor     = {Olaf Chitil and Zoltan Horv\'ath and Vikt\'oria Zs\'ok},
      volume     = {5083},
      series     = {Lecture Notes in Computer Science},
      pages      = {254--273},
      publisher  = {Springer},
      affil      = {ctca},
      isbn       = {978-3-540-85372-5},
      pubaddress = {Berlin, Heidelberg, Germany},
      topics     = {SAC},
      summary    = {},
      abstract   = {The specification and resolution of non-trivial domain constraints has become a well-recognised measure for improving the stability of large software systems. In this paper we propose an approach based on partial evaluation which tries to prove such constraints statically as far as possible and inserts efficient dynamic checks otherwise.},
      category   = {types, opt},
      doi        = {10.1007/978-3-540-85373-2_15},
      url        = {fctdt.pdf} 
    }
    The specification and resolution of non-trivial domain constraints has become a well-recognised measure for improving the stability of large software systems. In this paper we propose an approach based on partial evaluation which tries to prove such constraints statically as far as possible and inserts efficient dynamic checks otherwise.

2007

  • On Optimising Shape-generic Array Programs Using Symbolic Structural Information Kai Trojahner, Clemens Grelck, Sven-Bodo Scholz (2007). In Implementation and Application of Functional Languages, 18th International Symposium (IFL'06), Budapest, Hungary, Revised Selected Papers. pp. 1–18. Springer. DOIoosgapussi.pdf BibTexAbstract
    @InProceedings{TrojGrelSchoIFL06,
      author     = {Kai Trojahner and Clemens Grelck and Sven-Bodo Scholz},
      title      = {On Optimising Shape-generic Array Programs Using Symbolic Structural Information},
      booktitle  = {Implementation and Application of Functional Languages, 18th International Symposium (IFL'06), Budapest, Hungary, Revised Selected Papers},
      year       = {2007},
      editor     = {Zoltan Horv\'ath and Vikt\'oria Zs\'ok},
      volume     = {4449},
      series     = {Lecture Notes in Computer Science},
      pages      = {1--18},
      publisher  = {Springer},
      abstract   = { Shape-generic programming and high run time performance do match if generic source code is systematically specialised into nongeneric executable code. However, as soon as we drop the assumption of whole-world knowledge or refrain from specialisation for other reasons, compiled generic code is substantially less efficient. Limited effectiveness of code optimisation techniques due to the inherent lack of knowledge about the structural properties of arrays can be identified as the single most important source of inefficiency. However, in many cases partial structural information or structural relationships between arrays would actually suffice for optimisation. We propose symbolic array attributes as a uniform scheme to infer and to represent partial and relational structural information in shape-generic array code. By reusing the regular language to express structural properties in intermediate code, existing optimisations benefit from symbolic array attributes with little or no alteration. In fact, program optimisation and identification of structural properties cross-fertilise each other. We outline our approach in the context of the functional array language SaC and demonstrate its effectiveness by a small case study.},
      affil      = {ctca},
      category   = {core, opt},
      doi        = {10.1007/978-3-540-74130-5_},
      pubaddress = {Berlin, Heidelberg, Germany},
      topics     = {SAC},
      url        = {oosgapussi.pdf},
    }
    Shape-generic programming and high run time performance do match if generic source code is systematically specialised into nongeneric executable code. However, as soon as we drop the assumption of whole-world knowledge or refrain from specialisation for other reasons, compiled generic code is substantially less efficient. Limited effectiveness of code optimisation techniques due to the inherent lack of knowledge about the structural properties of arrays can be identified as the single most important source of inefficiency. However, in many cases partial structural information or structural relationships between arrays would actually suffice for optimisation. We propose symbolic array attributes as a uniform scheme to infer and to represent partial and relational structural information in shape-generic array code. By reusing the regular language to express structural properties in intermediate code, existing optimisations benefit from symbolic array attributes with little or no alteration. In fact, program optimisation and identification of structural properties cross-fertilise each other. We outline our approach in the context of the functional array language SaC and demonstrate its effectiveness by a small case study.
  • Index Vector Elimination: Making Index Vectors Affordable Robert Bernecky, Stephan Herhut, Sven-Bodo Scholz, Kai Trojahner, Clemens Grelck, Alex Shafarenko (2007). In Implementation and Application of Functional Languages, 18th International Symposium (IFL'06), Budapest, Hungary, Revised Selected Papers. pp. 19–36. Springer. DOIivemiva.pdf BibTexAbstract
    @InProceedings{BernHerhSchoetalIFL06,
      author     = {Robert Bernecky and Stephan Herhut and Sven-Bodo Scholz and Kai Trojahner and Clemens Grelck and Alex Shafarenko},
      title      = {Index Vector Elimination: Making Index Vectors Affordable},
      booktitle  = {Implementation and Application of Functional Languages, 18th International Symposium (IFL'06), Budapest, Hungary, Revised Selected Papers},
      year       = {2007},
      editor     = {Zoltan Horv\'ath and Vikt\'oria Zs\'ok and Andrew Butterfield},
      volume     = {4449},
      series     = {Lecture Notes in Computer Science},
      pages      = {19--36},
      publisher  = {Springer},
      affil      = {ctca},
      contents   = {[ISBN 978-3-540-74129-9]},
      pubaddress = {Berlin, Heidelberg, Germany},
      topics     = {SAC},
      summary    = {},
      abstract   = {Compiling indexing operations on n-dimensional arrays into efficiently executable code is a challenging task. This paper focuses on the reduction of offset computations as they typically occur when transforming index vectors into offsets for linearized representations of n-dimensional arrays. We present a high-level optimization to that effect which is generally applicable, even in the presence of statically unknown rank (n). Our experiments show run-time improvements between a factor of 2 and 16 on a set of real-world benchmarks.},
      category   = {opt},
      doi        = {10.1007/978-3-540-74130-5_2},
      url        = {ivemiva.pdf} 
    }
    Compiling indexing operations on n-dimensional arrays into efficiently executable code is a challenging task. This paper focuses on the reduction of offset computations as they typically occur when transforming index vectors into offsets for linearized representations of n-dimensional arrays. We present a high-level optimization to that effect which is generally applicable, even in the presence of statically unknown rank (n). Our experiments show run-time improvements between a factor of 2 and 16 on a set of real-world benchmarks.
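    A sketch of where such offset computations arise (ours, not from the paper; double square brackets denote selection with an index vector, and min is assumed from the stdlib):

    use Array: all;

    /* Every selection a[[i,i]] conceptually first builds the index
     * vector [i,i] and then linearises it into the offset i*cols + i.
     * IVE replaces the materialised index vectors by scalar offset
     * arithmetic and hoists the loop-invariant parts of it. */
    double[.] diagonal( double[.,.] a)
    {
      n = min( shape(a)[[0]], shape(a)[[1]]);
      return with {
               ( [0] <= [i] < [n]) : a[[i,i]];
             } : genarray( [n], 0.0);
    }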

2006

  • Shape Cliques Robert Bernecky (2006). In 18th International Symposium on Implementation and Application of Functional Languages (IFL'06), Budapest, Hungary. pp. 1–12. Eötvös Loránd University, Faculty of Informatics, Budapest, Hungary. sc.pdf BibTexAbstract
    @inproceedings{BerneckyIFL06,
      title        = {Shape Cliques},
      author       = {Robert Bernecky},
      year         = 2006,
      booktitle    = {18th International Symposium on Implementation and Application of Functional Languages (IFL'06), Budapest, Hungary},
      publisher    = {E\"otv\"os Lor\'and University, Faculty of Informatics, Budapest, Hungary},
      series       = {Technical Report 2006-S01},
      pages        = {1--12},
      editor       = {Zoltan Horv\'ath and Vikt\'oria Zs\'ok},
      topics       = {SAC,APEX,APL,Shape Cliques,Shape Inference},
      abstract     = {We introduce shape cliques, a simple way to organize a subset of the arrays appearing in an array-language-based application into sets of identically shaped arrays (shape cliques), and show how a compiler can analyze an application to infer membership in those cliques. We describe an algorithm for performing shape clique inference (SCI), and demonstrate that shape cliques can improve the performance of generated code, by permitting extension of an optimization for removal of run-time checks, and by extending the set of arrays to which optimizations, such as Index Vector Elimination (IVE), can be applied. Implementation of SCI in the APEX APL compiler permitted removal of 25 \% of the run-time checks remaining on 156 benchmarks after other compiler optimizations had eliminated 72 \% of the 1251 checks present in the original code. In the SAC compiler, IVE using SCI produced typical speedups of 2--14X on benchmarks operating on arrays of non-fixed rank and shape, compared to the operation of IVE in a non-SCI environment. Shape clique inference data can be exploited to allow certain other optimizations, such as loop fusion and with-loop folding, to be performed on arrays of statically unknown shape and rank, with the potential for significant reductions in execution time.},
      category   = {opt},
      doi        = {},
      url        = {sc.pdf},
    }
    We introduce shape cliques, a simple way to organize a subset of the arrays appearing in an array-language-based application into sets of identically shaped arrays (shape cliques), and show how a compiler can analyze an application to infer membership in those cliques. We describe an algorithm for performing shape clique inference (SCI), and demonstrate that shape cliques can improve the performance of generated code, by permitting extension of an optimization for removal of run-time checks, and by extending the set of arrays to which optimizations, such as Index Vector Elimination (IVE), can be applied. Implementation of SCI in the APEX APL compiler permitted removal of 25% of the run-time checks remaining on 156 benchmarks after other compiler optimizations had eliminated 72% of the 1251 checks present in the original code. In the SAC compiler, IVE using SCI produced typical speedups of 2–14X on benchmarks operating on arrays of non-fixed rank and shape, compared to the operation of IVE in a non-SCI environment. Shape clique inference data can be exploited to allow certain other optimizations, such as loop fusion and with-loop folding, to be performed on arrays of statically unknown shape and rank, with the potential for significant reductions in execution time.
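    An example of a clique (our sketch, not from the paper): all three arrays below provably share one shape, since element-wise arithmetic and rotation are shape-preserving, so shape checks and offset computations can be done once per clique rather than once per array.

    use Array: all;

    double[*] f( double[*] a)
    {
      b = a + 1.0;           /* same shape as a                  */
      c = rotate( 0, 1, b);  /* rotation preserves the shape     */
      return a + b + c;      /* {a, b, c} form one shape clique  */
    }
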
  • Merging Compositions of Array Skeletons in Sac Clemens Grelck, Sven-Bodo Scholz (2006). Journal of Parallel Computing 32 (7+8) pp. 507–522. DOImcoasis.pdf BibTexAbstract
    @Article{GrelSchoPARCO06,
      author    = {Clemens Grelck and Sven-Bodo Scholz},
      title     = {Merging Compositions of Array Skeletons in Sac},
      journal   = {Journal of Parallel Computing},
      year      = {2006},
      volume    = {32},
      number    = {7+8},
      pages     = {507--522},
      abstract  = {The design of skeletons for expressing concurrent computations usually faces a conflict between software engineering demands and performance issues. Whereas the former favour versatile fine-grain skeletons that can be successively combined into larger programs, coarse-grain skeletons are more desirable from a performance perspective. We describe a way out of this dilemma for array skeletons. In the functional array language SaC we internally represent individual array skeletons by one or more meta skeletons, called with-loops. The design of with-loops is carefully chosen to be versatile enough to cope with a large variety of skeletons, yet to be simple enough to allow for compilation into efficiently executable (parallel) code. Furthermore, with-loops are closed with respect to three tailor-made optimisation techniques, that systematically transform compositions of simple, computationally light weight skeletons into few complex and computationally heavier-weight skeletons.},
      affil     = {ctca},
      category  = {opt},
      contents  = {[ISSN 0167-8191]},
      doi       = {10.1016/j.parco.2006.08.003},
      sourceurl = {http://dx.doi.org/10.1016/j.parco.2006.08.003},
      topics    = {SAC},
      url       = {mcoasis.pdf},
    }
    The design of skeletons for expressing concurrent computations usually faces a conflict between software engineering demands and performance issues. Whereas the former favour versatile fine-grain skeletons that can be successively combined into larger programs, coarse-grain skeletons are more desirable from a performance perspective. We describe a way out of this dilemma for array skeletons. In the functional array language SaC we internally represent individual array skeletons by one or more meta skeletons, called with-loops. The design of with-loops is carefully chosen to be versatile enough to cope with a large variety of skeletons, yet to be simple enough to allow for compilation into efficiently executable (parallel) code. Furthermore, with-loops are closed with respect to three tailor-made optimisation techniques, that systematically transform compositions of simple, computationally light weight skeletons into few complex and computationally heavier-weight skeletons.
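    The optimisations act on compositions like the following sketch (ours, not from the paper). Naively, each line becomes a separate with-loop with its own temporary array, whereas the merging transformations condense them into a single traversal:

    use Array: all;

    double[.,.] step( double[.,.] a, double[.,.] b, double[.,.] c)
    {
      t1 = a * b;       /* one with-loop, temporary t1       */
      t2 = t1 + c;      /* a second with-loop, temporary t2  */
      /* after merging: a single with-loop computing
       * 2.0 * (a[iv] * b[iv] + c[iv]) for each index iv     */
      return 2.0 * t2;
    }
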
  • With-loop Fusion for Data Locality and Parallelism Clemens Grelck, Karsten Hinckfuß, Sven-Bodo Scholz (2006). In Implementation and Application of Functional Languages, 17th International Workshop (IFL'05), Dublin, Ireland, Revised Selected Papers. pp. 178–195. Springer. DOIwlffdlap.pdf BibTexAbstract
    @InProceedings{GrelHincSchoIFL05,
      author     = {Clemens Grelck and Karsten Hinckfuß and Sven-Bodo Scholz},
      title      = {With-loop Fusion for Data Locality and Parallelism},
      booktitle  = {Implementation and Application of Functional Languages, 17th International Workshop (IFL'05), Dublin, Ireland, Revised Selected Papers},
      year       = {2006},
      editor     = {Andrew Butterfield},
      volume     = {4015},
      series     = {Lecture Notes in Computer Science},
      pages      = {178--195},
      publisher  = {Springer},
      abstract   = {With-loops are versatile array comprehensions used in the functional array language SaC to implement aggregate array operations that are applicable to arrays of any rank and shape. We describe the fusion of with-loops as a novel optimisation technique to improve both the data locality of compiled code in general and the synchronisation behaviour of compiler-parallelised code in particular. Some experiments demonstrate the impact of with-loop fusion on the runtime performance of compiled SaC code.},
      affil      = {ctca},
      category   = {opt},
      doi        = {10.1007/11964681_11},
      pubaddress = {Berlin, Heidelberg, Germany},
      topics     = {SAC},
      url        = {wlffdlap.pdf},
    }
    With-loops are versatile array comprehensions used in the functional array language SaC to implement aggregate array operations that are applicable to arrays of any rank and shape. We describe the fusion of with-loops as a novel optimisation technique to improve both the data locality of compiled code in general and the synchronisation behaviour of compiler-parallelised code in particular. Some experiments demonstrate the impact of with-loop fusion on the runtime performance of compiled SaC code.
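    With-loop fusion targets independent with-loops that traverse the same index space, as in this sketch (ours, not from the paper; SaC functions may return several values):

    use Array: all;

    double, double max_and_sum( double[.] v)
    {
      mx = with { ( [0] <= iv < shape(v)) : v[iv]; } : fold( max, v[[0]]);
      s  = with { ( [0] <= iv < shape(v)) : v[iv]; } : fold( +, 0.0);
      /* Both with-loops scan v identically; fusion combines them into
       * one traversal, improving data locality and, in the generated
       * multithreaded code, halving the synchronisation barriers. */
      return (mx, s);
    }
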
  • A Binding Scope Analysis for Generic Programs on Arrays Clemens Grelck, Sven-Bodo Scholz, Alex Shafarenko (2006). In Implementation and Application of Functional Languages, 17th International Workshop (IFL'05), Dublin, Ireland, Revised Selected Papers. pp. 212–230. Springer. DOIabsafgpoa.pdf BibTexAbstract
    @InProceedings{GrelSchoShafIFL05,
      author     = {Clemens Grelck and Sven-Bodo Scholz and Alex Shafarenko},
      title      = {A Binding Scope Analysis for Generic Programs on Arrays},
      booktitle  = {Implementation and Application of Functional Languages, 17th International Workshop (IFL'05), Dublin, Ireland, Revised Selected Papers},
      year       = {2006},
      editor     = {Andrew Butterfield},
      volume     = {4015},
      series     = {Lecture Notes in Computer Science},
      pages      = {212--230},
      publisher  = {Springer},
      affil      = {ctca},
      pubaddress = {Berlin, Heidelberg, Germany},
      topics     = {SAC},
      summary    = {},
      abstract   = {Performance of generic array programs crucially relies on program specialisation wrt. shape information. Traditionally, this is done in a rather ad hoc fashion by propagating all shape information that is available. When striving for a compositional programming style that adheres to good software engineering principles this approach turns out to be insufficient. Instead, static value information needs to be propagated as well which introduces all the well known problems of partial evaluation in general. In this paper, we propose a static analysis that identifies to what extent specialisation needs to be employed in order to achieve a certain level of shape information. This narrows the scope of specialisation far enough to make specialisation for shape information feasible despite a compositional programming style. Some examples to this effect are presented.},
      category   = {opt},
      doi        = {10.1007/11964681_13},
      url        = {absafgpoa.pdf},
    }
    Performance of generic array programs crucially relies on program specialisation wrt. shape information. Traditionally, this is done in a rather ad hoc fashion by propagating all shape information that is available. When striving for a compositional programming style that adheres to good software engineering principles this approach turns out to be insufficient. Instead, static value information needs to be propagated as well which introduces all the well known problems of partial evaluation in general. In this paper, we propose a static analysis that identifies to what extent specialisation needs to be employed in order to achieve a certain level of shape information. This narrows the scope of specialisation far enough to make specialisation for shape information feasible despite a compositional programming style. Some examples to this effect are presented.

2005

  • A Hybrid Shared Memory Execution Model for a Data Parallel Language with I/o C. Grelck, S. Kuthe, S.-B. Scholz (2005). In 3rd Workshop on High-Level Parallel Programming and Applications (HLPP'05), Warwick, UK. pp. 147–160. University of Warwick, UK. DOIahsmemfadpl.pdf BibTexAbstract
    @InProceedings{GrelKuthSchoHLPP05,
      author    = {C. Grelck and S. Kuthe and S.-B. Scholz},
      title     = {A Hybrid Shared Memory Execution Model for a Data Parallel Language with I/o},
      booktitle = {3rd Workshop on High-Level Parallel Programming and Applications (HLPP'05), Warwick, UK},
      year      = {2005},
      editor    = {A. Tiskin},
      pages     = {147--160},
      publisher = {University of Warwick, UK},
      abstract  = {We propose a novel execution model for the implicitly parallel execution of data parallel programs in the presence of general I/O operations. This model is called hybrid because it combines the advantages of the standard execution models fork/join and SPMD. Based on program analysis the hybrid model adapts itself to one or the other on the granularity of individual instructions. We outline compilation techniques that systematically derive the organization of parallel code from data flow characteristics aiming at the reduction of execution mode switches in general and synchronization/communication requirements in particular. Experiments based on a prototype implementation show the effectiveness of the hybrid execution model for reducing parallel overhead.},
      affil     = {ctca},
      category  = {design, opt},
      doi       = {10.1142/S012962640800320X},
      topics    = {SAC},
      url       = {ahsmemfadpl.pdf},
    }
    We propose a novel execution model for the implicitly parallel execution of data parallel programs in the presence of general I/O operations. This model is called hybrid because it combines the advantages of the standard execution models fork/join and SPMD. Based on program analysis the hybrid model adapts itself to one or the other on the granularity of individual instructions. We outline compilation techniques that systematically derive the organization of parallel code from data flow characteristics aiming at the reduction of execution mode switches in general and synchronization/communication requirements in particular. Experiments based on a prototype implementation show the effectiveness of the hybrid execution model for reducing parallel overhead.

2004

  • Implicit Memory Management for Sac Clemens Grelck, Kai Trojahner (2004). In Implementation and Application of Functional Languages, 16th International Workshop, IFL'04. pp. 335–348. University of Kiel, Institute of Computer Science and Applied Mathematics. greltrojifl04.pdf BibTexAbstract
    @InProceedings{GrelTrojIFL04,
      author    = {Clemens Grelck and Kai Trojahner},
      title     = {Implicit Memory Management for Sac},
      booktitle = {Implementation and Application of Functional Languages, 16th International Workshop, IFL'04},
      year      = {2004},
      editor    = {Clemens Grelck and Frank Huch},
      pages     = {335--348},
      publisher = {University of Kiel, Institute of Computer Science and Applied Mathematics},
      note      = {Technical Report 0408},
      abstract  = {While almost all functional languages rely on garbage collection for implicit memory management, the needs of efficient array processing are better suited by reference counting. The opportunities to reclaim unused memory immediately and to implement functionally sound array operations by destructive in-place updates turn out to be essential for approaching the performance achieved by imperative languages. In this paper we outline the realization of implicit memory management in the functional array language SaC. Starting with basic schemes for the introduction of memory management and reference counting instructions into SaC code, the emphasis is laid on a range of optimizations that aim at reducing runtime overhead and exploiting memory reuse opportunities.},
      category  = {design, opt},
      topics    = {SAC},
      url       = {greltrojifl04.pdf},
    }
    While almost all functional languages rely on garbage collection for implicit memory management, the needs of efficient array processing are better suited by reference counting. The opportunities to reclaim unused memory immediately and to implement functionally sound array operations by destructive in-place updates turn out to be essential for approaching the performance achieved by imperative languages. In this paper we outline the realization of implicit memory management in the functional array language SaC. Starting with basic schemes for the introduction of memory management and reference counting instructions into SaC code, the emphasis is laid on a range of optimizations that aim at reducing runtime overhead and exploiting memory reuse opportunities.
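    A sketch of the reuse opportunity (ours, not from the paper; indexed assignment is syntactic sugar for a modarray operation):

    use Array: all;

    double[.] touch( double[.] a)
    {
      /* If a's reference count is 1 here, the compiled code updates the
       * element destructively in a's memory; only if the array is still
       * shared does it copy first.  Reference counting makes this
       * decision immediately, with no garbage collector involved. */
      a[[0]] = 42.0;
      return a;
    }
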
  • With-loop Scalarization: Merging Nested Array Operations Clemens Grelck, Sven-Bodo Scholz, Kai Trojahner (2004). In Implementation of Functional Languages, 15th International Workshop (IFL'03), Edinburgh, Scotland, UK, Revised Selected Papers. Springer. DOIwlsmnao.pdf BibTexAbstract
    @InProceedings{GrelSchoTrojIFL03,
      author     = {Clemens Grelck and Sven-Bodo Scholz and Kai Trojahner},
      title      = {With-loop Scalarization: Merging Nested Array Operations},
      booktitle  = {Implementation of Functional Languages, 15th International Workshop (IFL'03), Edinburgh, Scotland, UK, Revised Selected Papers},
      year       = {2004},
      editor     = {Phil Trinder and Greg Michaelson},
      volume     = {3145},
      series     = {Lecture Notes in Computer Science},
      publisher  = {Springer},
      abstract   = {Construction of complex array operations by composition of more basic ones allows for abstract and concise specifications of algorithms. Unfortunately, naïve compilation of such specifications leads to creation of many temporary arrays at runtime and, consequently, to poor performance characteristics. This paper elaborates on a new compiler optimization, named with-loop scalarization, which aims at eliminating temporary arrays in the context of nested array operations. It is based on with-loops, a versatile array comprehension construct used by the functional array language SaC both for specification as well as for internal representation of array operations. The impact of with-loop scalarization on the runtime performance of compiled SaC code is demonstrated by several experiments involving support for arithmetic on arrays of complex numbers and the application kernel FT from the NAS benchmark suite.},
      category   = {design, opt},
      doi        = {10.1007/978-3-540-27861-0_8},
      pubaddress = {Berlin, Heidelberg, Germany},
      topics     = {SAC,Avoiding Temporaries},
      url        = {WLSMNAO.pdf},
    }
    Construction of complex array operations by composition of more basic ones allows for abstract and concise specifications of algorithms. Unfortunately, naïve compilation of such specifications leads to creation of many temporary arrays at runtime and, consequently, to poor performance characteristics. This paper elaborates on a new compiler optimization, named with-loop scalarization, which aims at eliminating temporary arrays in the context of nested array operations. It is based on with-loops, a versatile array comprehension construct used by the functional array language SaC both for specification as well as for internal representation of array operations. The impact of with-loop scalarization on the runtime performance of compiled SaC code is demonstrated by several experiments involving support for arithmetic on arrays of complex numbers and the application kernel FT from the NAS benchmark suite.
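    A sketch of the nesting that with-loop scalarization removes (ours, not from the paper), in the spirit of the complex-number support mentioned above:

    use Array: all;

    /* The element computation yields a two-element vector per index, so
     * a naive compilation allocates one small temporary vector for
     * every element of the result. */
    double[.,.] to_complex( double[.] re, double[.] im)
    {
      return with {
               ( [0] <= iv < shape(re)) : [ re[iv], im[iv] ];
             } : genarray( shape(re), [0.0, 0.0]);
    }
    /* With-loop scalarization rewrites this into a single rank-2
     * with-loop that writes both doubles directly into the result,
     * without building intermediate vectors. */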

2003

  • A Multithreaded Compiler Backend for High-level Array Programming Clemens Grelck (2003). In 2nd International Conference on Parallel and Distributed Computing and Networks (PDCN'03), Innsbruck, Austria. pp. 478–484. ACTA Press. DOIamcbfhlap.pdf BibTexAbstract
    @InProceedings{GrelckPDCN03,
      author     = {Clemens Grelck},
      title      = {A Multithreaded Compiler Backend for High-level Array Programming},
      booktitle  = {2nd International Conference on Parallel and Distributed Computing and Networks (PDCN'03), Innsbruck, Austria},
      year       = {2003},
      editor     = {Mohammed H. Hamza},
      pages      = {478--484},
      publisher  = {ACTA Press},
      abstract   = {Whenever large homogeneous data structures need to be processed in a non-trivial way, e.g. in computational sciences, image processing, or system simulation, high-level array programming in the style of APL offers a far more concise and abstract approach than traditional scalar languages such as C/C++ or FORTRAN-77. The same sort of applications often can also be characterized as performance critical and today represents the major domain for parallel processing. This paper reports on the development of a compiler backend which allows multithreaded code to be generated implicitly from high-level array program specifications. On shared memory multiprocessor systems, this code can be executed in parallel without any additional programming effort. After sketching out basic compilation schemes, optimizations on the runtime system are addressed and, finally, experimental runtime figures are presented.},
      category   = {core, par, opt},
      contents   = {[ISBN 0-88986-341-5],[ISSN 1027-2666]},
      doi        = {10.1.1.408.8553},
      pubaddress = {Anaheim, CA, USA},
      sourceurl  = {http://www.isp.mu-luebeck.de/~grelck/publications/mt-backend-innsbruck-03.ps.gz},
      topics     = {SAC,Shared Memory},
      url        = {AMCBFHLAP.pdf},
    }
    Whenever large homogeneous data structures need to be processed in a non-trivial way, e.g. in computational sciences, image processing, or system simulation, high-level array programming in the style of APL offers a far more concise and abstract approach than traditional scalar languages such as C/C++ or FORTRAN-77. The same sort of applications often can also be characterized as performance critical and today represents the major domain for parallel processing. This paper reports on the development of a compiler backend which allows multithreaded code to be generated implicitly from high-level array program specifications. On shared memory multiprocessor systems, this code can be executed in parallel without any additional programming effort. After sketching out basic compilation schemes, optimizations on the runtime system are addressed and, finally, experimental runtime figures are presented.
  • Axis Control in Sac Clemens Grelck, Sven-Bodo Scholz (2003). In Implementation of Functional Languages, 14th International Workshop (IFL'02), Madrid, Spain, Revised Selected Papers. pp. 182–198. Springer. DOIacis.pdf BibTexAbstract
    @InProceedings{GrelSchoIFL02,
      author     = {Clemens Grelck and Sven-Bodo Scholz},
      title      = {Axis Control in Sac},
      booktitle  = {Implementation of Functional Languages, 14th International Workshop (IFL'02), Madrid, Spain, Revised Selected Papers},
      year       = {2003},
      editor     = {Ricardo Pe{\~n}a and Thomas Arts},
      volume     = {2670},
      series     = {Lecture Notes in Computer Science},
      pages      = {182--198},
      publisher  = {Springer},
      abstract   = {High-level array processing is characterized by the composition of generic operations, which treat all array elements in a uniform way. This paper proposes a mechanism that allows programmers to direct effects of such array operations to non-scalar subarrays of argument arrays without sacrificing the high-level programming approach. A versatile notation for axis control is presented, and it is shown how the additional language constructs can be transformed into regular SaC code. Furthermore, an optimization technique is introduced which achieves the same runtime performance regardless of whether code is written using the new notation or in a substantially less elegant style employing conventional language features.},
      category   = {core, opt},
      doi        = {10.1.1.540.8938},
      pubaddress = {Berlin, Heidelberg, Germany},
      sourceurl  = {http://www.isp.mu-luebeck.de/~grelck/publications/axis-control-madrid-02.ps.gz},
      topics     = {SAC},
      url        = {ACIS.pdf},
    }
    High-level array processing is characterized by the composition of generic operations, which treat all array elements in a uniform way. This paper proposes a mechanism that allows programmers to direct effects of such array operations to non-scalar subarrays of argument arrays without sacrificing the high-level programming approach. A versatile notation for axis control is presented, and it is shown how the additional language constructs can be transformed into regular SaC code. Furthermore, an optimization technique is introduced which achieves the same runtime performance regardless of whether code is written using the new notation or in a substantially less elegant style employing conventional language features.
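    This style of notation survives in SaC's set expressions; a sketch of the idea (ours, assuming recent sac2c syntax, with the bounds of the index variables inferred from their uses):

    use Array: all;

    /* Transposition: the operation is directed at axes by naming them. */
    double[.,.] transpose2( double[.,.] m)
    {
      return { [i,j] -> m[[j,i]] };
    }

    double maxval( double[.] v)
    {
      return with { ( [0] <= iv < shape(v)) : v[iv]; } : fold( max, v[[0]]);
    }

    /* Row-wise normalisation: a vector operation applied to every
     * rank-1 subarray m[[i]] along the first axis. */
    double[.,.] norm_rows( double[.,.] m)
    {
      return { [i] -> m[[i]] / maxval( m[[i]]) };
    }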

2002

  • A Compilation Scheme for a Hierarchy of Array Types Dietmar Kreye (2002). In Implementation of Functional Languages, 13th International Workshop (IFL'01), Stockholm, Sweden, Selected Papers. pp. 18–35. Springer. DOIacsfahoat.pdf BibTexAbstract
    @InProceedings{KreyeIFL01,
      author     = {Dietmar Kreye},
      title      = {A Compilation Scheme for a Hierarchy of Array Types},
      booktitle  = {Implementation of Functional Languages, 13th International Workshop (IFL'01), Stockholm, Sweden, Selected Papers},
      year       = {2002},
      editor     = {Thomas Arts and Markus Mohnen},
      volume     = {2312},
      series     = {Lecture Notes in Computer Science},
      pages      = {18--35},
      publisher  = {Springer},
      pubaddress = {Berlin, Heidelberg, Germany},
      sourceurl  = {http://www.sac-home.org/publications/sac2c-comp-hierarchy-stockholm-01.ps.gz},
      topics     = {SAC,Implementation of Arrays},
      summary    = {},
      abstract   = {In order to achieve a high level of abstraction, array-oriented languages provide language constructs for defining array operations in a shape-invariant way. However, when trying to compile such generic array operations into efficiently executable code, static knowledge of exact shapes is essential. Therefore, modern compilers try to infer the shapes of all arrays used in a program. Unfortunately, shape inference is generally undecidable. Therefore, most compilers either rule out all programs for which shape inference fails, or they perform no shape inference at all. In the first case the expressive power of the language is restricted, in the latter the generated code has a weak runtime performance. This paper presents a new compilation scheme for the language Sac which combines these two approaches in order to avoid their individual shortcomings. A preliminary performance evaluation demonstrates the benefits of this compilation scheme.},
      category   = {core, opt},
      doi        = {10.1.1.138.8533},
      url        = {ACSFAHOAT.pdf},
    }
    In order to achieve a high level of abstraction, array-oriented languages provide language constructs for defining array operations in a shape-invariant way. However, when trying to compile such generic array operations into efficiently executable code, static knowledge of exact shapes is essential. Therefore, modern compilers try to infer the shapes of all arrays used in a program. Unfortunately, shape inference is generally undecidable. Therefore, most compilers either rule out all programs for which shape inference fails, or they perform no shape inference at all. In the first case the expressive power of the language is restricted, in the latter the generated code has a weak runtime performance. This paper presents a new compilation scheme for the language Sac which combines these two approaches in order to avoid their individual shortcomings. A preliminary performance evaluation demonstrates the benefits of this compilation scheme.
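    The hierarchy in question is SaC's AKS/AKD/AUD array types; a sketch of the three levels (ours, assuming the stdlib sum):

    use Array: all;

    /* double[3,4]  AKS: rank and shape statically known
     * double[.,.]  AKD: rank known (here 2), shape unknown
     * double[*]    AUD: neither rank nor shape known        */
    double total( double[3,4] aks, double[.,.] akd, double[*] aud)
    {
      /* The compilation scheme emits descriptor-free code for the AKS
       * argument, fixed-depth loop nests for AKD, and fully generic
       * code only where AUD leaves no other choice. */
      return sum( aks) + sum( akd) + sum( aud);
    }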

2001

  • Improving Cache Effectiveness through Array Data Layout in Sac Clemens Grelck (2001). In Implementation of Functional Languages, 12th International Workshop (IFL'00), Aachen, Germany, Selected Papers. pp. 231–248. Springer. DOIarray-data-layout-aachen-00.pdf BibTexAbstract
    @InProceedings{GrelckIFL00,
      author     = {Clemens Grelck},
      title      = {Improving Cache Effectiveness through Array Data Layout in Sac},
      booktitle  = {Implementation of Functional Languages, 12th International Workshop (IFL'00), Aachen, Germany, Selected Papers},
      year       = {2001},
      editor     = {Markus Mohnen and Pieter Koopman},
      volume     = {2011},
      series     = {Lecture Notes in Computer Science},
      pages      = {231--248},
      publisher  = {Springer},
      abstract   = {SAC is a functional array processing language particularly designed with numerical applications in mind. In this field the runtime performance of programs critically depends on the efficient utilization of the memory hierarchy. Cache conflicts due to limited set associativity are one relevant source of inefficiency. This paper describes the realization of an optimization technique which aims at eliminating cache conflicts by adjusting the data layout of arrays to specific access patterns and cache configurations. Its effect on cache utilization and runtime performance is demonstrated by investigations on the PDE1 benchmark.},
      category   = {core,opt},
      doi        = {10.1007/3-540-45361-X_14},
      pubaddress = {Berlin, Heidelberg, Germany},
      summary    = {This paper describes the implementation of implicit array padding as a way to resolve self-interference cache conflicts in SAC programs. Besides an improved inference heuristic, array access pattern analysis and the realization of padding as a high-level program transformation on intermediate SAC code are outlined.},
      topics     = {SAC,Array Data Layout},
      url        = {array-data-layout-aachen-00.pdf},
    }
    SAC is a functional array processing language particularly designed with numerical applications in mind. In this field the runtime performance of programs critically depends on the efficient utilization of the memory hierarchy. Cache conflicts due to limited set associativity are one relevant source of inefficiency. This paper describes the realization of an optimization technique which aims at eliminating cache conflicts by adjusting the data layout of arrays to specific access patterns and cache configurations. Its effect on cache utilization and runtime performance is demonstrated by investigations on the PDE1 benchmark.
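    For illustration only (the compiler applies padding implicitly; the sizes here are invented): widening the innermost axis shifts successive rows to different cache sets, eliminating self-interference conflicts.

      use Array: all;

      int main()
      {
        a = genarray([1024, 1024], 0.0);      /* row stride maps rows onto the same cache sets */
        p = genarray([1024, 1024 + 8], 0.0);  /* 8 unused pad elements per row break this      */
        /* computations only ever index p[[i, j]] with j < 1024; the
           inference described above chooses the pad size from the
           access pattern and the cache configuration                */
        return 0;
      }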

2000

  • Array Padding in the Functional Language Sac Clemens Grelck (2000). In International Conference on Parallel and Distributed Processing Techniques and Applications (PDPTA'00), Las Vegas, Nevada, USA. pp. 2553–2560. CSREA Press, Athens, Georgia, USA. array-padding-lasvegas-00.pdf BibTexAbstract
    @InProceedings{GrelckPDPTA00,
      author    = {Clemens Grelck},
      title     = {Array Padding in the Functional Language Sac},
      booktitle = {International Conference on Parallel and Distributed Processing Techniques and Applications (PDPTA'00), Las Vegas, Nevada, USA},
      year      = {2000},
      editor    = {Hamid R. Arabnia},
      volume    = {5},
      pages     = {2553--2560},
      publisher = {CSREA Press, Athens, Georgia, USA},
      abstract  = {Sac is a functional array processing language that tries to combine generic, high-level program specifications with efficient runtime behavior. Being particularly designed for numerical applications, runtime performance critically depends on the effective utilization of the memory hierarchy. For many programs, however, it can be observed that the achieved performance significantly changes with small variations in the problem size. Array padding is a well-known optimization technique that adjusts the data layout of arrays in order to make better usage of caches. The paper presents an algorithm that derives a customized data layout from an array access pattern and a cache specification. Cache phenomena such as spatial and temporal reuse are taken into account as well as different cache architectures. The effectiveness is demonstrated by investigations on the runtime performance of the PDE1 benchmark on a shared memory multiprocessor.},
      category  = {core,opt,par},
      summary   = {This paper describes an inference strategy to identify suitable pad sizes for intra-array padding in order to eliminate self interference cache conflicts in numerical algorithms. Its performance impact in particular with respect to multithreaded program execution is shown by means of runtime figures.},
      topics    = {SAC,Array Data Layout},
      url       = {array-padding-lasvegas-00.pdf},
    }
    Sac is a functional array processing language that tries to combine generic, high-level program specifications with efficient runtime behavior. Being particularly designed for numerical applications, runtime performance critically depends on the effective utilization of the memory hierarchy. For many programs, however, it can be observed that the achieved performance significantly changes with small variations in the problem size. Array padding is a well-known optimization technique that adjusts the data layout of arrays in order to make better usage of caches. The paper presents an algorithm that derives a customized data layout from an array access pattern and a cache specification. Cache phenomena such as spatial and temporal reuse are taken into account as well as different cache architectures. The effectiveness is demonstrated by investigations on the runtime performance of the PDE1 benchmark on a shared memory multiprocessor.

1999

  • Shared Memory Multiprocessor Support for Sac Clemens Grelck (1999). In Implementation of Functional Languages, 10th International Workshop (IFL'98), London, England, UK, Selected Papers. pp. 38–54. Springer. DOImt-support-london-98.pdf BibTexAbstract
    @InProceedings{GrelckIFL98,
      author     = {Clemens Grelck},
      title      = {Shared Memory Multiprocessor Support for Sac},
      booktitle  = {Implementation of Functional Languages, 10th International Workshop (IFL'98), London, England, UK, Selected Papers},
      year       = {1999},
      editor     = {Kevin Hammond and Tony Davie and Chris Clack},
      volume     = {1595},
      series     = {Lecture Notes in Computer Science},
      pages      = {38--54},
      publisher  = {Springer},
      abstract   = {Sac (Single Assignment C) is a strict, purely functional programming language primarily designed with numerical applications in mind. Particular emphasis is on efficient support for arrays both in terms of language expressiveness and in terms of runtime performance. Array operations in Sac are based on element-wise specifications using so-called With-loops. These language constructs are also well-suited for concurrent execution on multiprocessor systems. This paper outlines an implicit approach to compile Sac programs for multi-threaded execution on shared memory architectures. Besides the basic compilation scheme, a brief overview of the runtime system is given. Finally, preliminary performance figures demonstrate that this approach is well-suited to achieve almost linear speedups.},
      category   = {core,design,par,opt},
      doi        = {10.1007/3-540-48515-5_3},
      pubaddress = {Berlin, Heidelberg, Germany},
      summary    = {This paper introduces the basic concepts of multi-threaded program execution on shared memory multiprocessor systems. The described approach is completely implicit, i.e. the compiler and the runtime system handle everything from granularity to load balancing on their own. The paper also contains preliminary performance evaluations.},
      topics     = {SAC,Shared Memory},
      url        = {mt-support-london-98.pdf},
    }
    Sac (Single Assignment C) is a strict, purely functional programming language primarily designed with numerical applications in mind. Particular emphasis is on efficient support for arrays both in terms of language expressiveness and in terms of runtime performance. Array operations in Sac are based on element-wise specifications using so-called With-loops. These language constructs are also well-suited for concurrent execution on multiprocessor systems. This paper outlines an implicit approach to compile Sac programs for multi-threaded execution on shared memory architectures. Besides the basic compilation scheme, a brief overview of the runtime system is given. Finally, preliminary performance figures demonstrate that this approach is well-suited to achieve almost linear speedups.
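    A sketch of the kind of With-loop such compilation parallelises implicitly (hypothetical example): the source contains no thread counts, scheduling directives, or synchronisation.

      use Array: all;

      double sumsq(double[+] a)     /* data-parallel reduction over any array */
      {
        return with {
                 (0*shape(a) <= iv < shape(a)) : a[iv] * a[iv];
               } : fold(+, 0.0);
      }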
  • A Case Study: Effects of With-loop Folding on the Nas Benchmark Mg in Sac Sven-Bodo Scholz (1999). In Implementation of Functional Languages, 10th International Workshop (IFL'98), London, England, UK, Selected Papers. pp. 216–228. Springer. DOIwlf-exp-london-98.pdf BibTexAbstract
    @InProceedings{ScholzIFL98,
      author     = {Sven-Bodo Scholz},
      title      = {A Case Study: Effects of With-loop Folding on the Nas Benchmark Mg in Sac},
      booktitle  = {Implementation of Functional Languages, 10th International Workshop (IFL'98), London, England, UK, Selected Papers},
      year       = {1999},
      editor     = {Kevin Hammond and Tony Davie and Chris Clack},
      volume     = {1595},
      series     = {Lecture Notes in Computer Science},
      pages      = {216--228},
      publisher  = {Springer},
      abstract   = {Sac is a functional C variant with efficient support for high-level array operations. This paper investigates the applicability of a Sac specific optimization technique called WITH-loop-folding to real world applications. As an example program which originates from the Numerical Aerodynamic Simulation (NAS) Program developed at NASA Ames Research Center, the so-called NAS benchmark MG is chosen. It comprises a kernel from the NAS Program which implements 3-dimensional multigrid relaxation. Several run-time measurements exploit two different benefits of With-loop-folding: First, an overall speed-up of about 20% can be observed. Second, a comparison between the run-times of a hand-optimized specification and of APL-like specifications yields identical run-times, although a naive compilation that does not apply With-loop-folding leads to slowdowns of more than an order of magnitude. Furthermore, With-loop-folding makes a slight variation of the algorithm feasible which substantially simplifies the program specification and requires less memory during execution. Finally, the optimized run-times are compared against run-times gained from the original Fortran program, which shows that for different problem sizes, the code generated from the Sac program does not only reach the execution times of the code generated from the Fortran program but even outperforms them by about 10%.},
      category   = {apps,core,opt},
      doi        = {10.1007/3-540-48515-5_14},
      pubaddress = {Berlin, Heidelberg, Germany},
      summary    = {This paper investigates the impact of high-level program transformations, WITH-loop folding in particular, on the style of program specifications and the runtime performance of programs.},
      topics     = {SAC,Avoiding Temporaries,NAS},
      url        = {wlf-exp-london-98.pdf},
    }
    Sac is a functional C variant with efficient support for high-level array operations. This paper investigates the applicability of a Sac specific optimization technique called WITH-loop-folding to real world applications. As an example program which originates from the Numerical Aerodynamic Simulation (NAS) Program developed at NASA Ames Research Center, the so-called NAS benchmark MG is chosen. It comprises a kernel from the NAS Program which implements 3-dimensional multigrid relaxation. Several run-time measurements exploit two different benefits of With-loop-folding: First, an overall speed-up of about 20% can be observed. Second, a comparison between the run-times of a hand-optimized specification and of APL-like specifications yields identical run-times, although a naive compilation that does not apply With-loop-folding leads to slowdowns of more than an order of magnitude. Furthermore, With-loop-folding makes a slight variation of the algorithm feasible which substantially simplifies the program specification and requires less memory during execution. Finally, the optimized run-times are compared against run-times gained from the original Fortran program, which shows that for different problem sizes, the code generated from the Sac program does not only reach the execution times of the code generated from the Fortran program but even outperforms them by about 10%.

1998

  • With-loop-folding in Sac — Condensing Consecutive Array Operations Sven-Bodo Scholz (1998). In Implementation of Functional Languages, 9th International Workshop (IFL'97), St. Andrews, UK, Selected Papers. pp. 72–92. Springer. DOIwlf-st-andrews-97.pdf BibTexAbstract
    @InProceedings{ScholzIFL1997,
      author     = {Sven-Bodo Scholz},
      title      = {With-loop-folding in Sac --- Condensing Consecutive Array Operations},
      booktitle  = {Implementation of Functional Languages, 9th International Workshop (IFL'97), St. Andrews, UK, Selected Papers},
      year       = {1998},
      editor     = {Chris Clack and Tony Davie and Kevin Hammond},
      volume     = {1467},
      series     = {Lecture Notes in Computer Science},
      pages      = {72--92},
      publisher  = {Springer},
      abstract   = {This paper introduces a new compiler optimization called With-loop-folding. It is based on a special loop construct, the with-loop, which in the functional language SAC (for Single Assignment C) serves as a versatile vehicle to describe array operations on an elementwise basis. A general mechanism for combining two of these With-loops into a single loop construct is presented. This mechanism constitutes a powerful tool when it comes to generate efficiently executable code from high-level array specifications. By means of a few examples it is shown that even complex nestings of array operations similar to those available in Apl can be transformed into single loop operations which are similar to hand-optimized With-loop specifications. As a consequence, the way a complex array operation is combined from primitive array operations does not affect the runtime performance of the compiled code, i.e., the programmer is liberated from the burden to take performance considerations into account when specifying complex array operations.},
      category   = {core,design,opt},
      doi        = {10.1007/BFb0055425},
      isbn       = {978-3-540-64849-9},
      pubaddress = {Berlin, Heidelberg, Germany},
      topics     = {SAC,Avoiding Temporaries,Implementation of Arrays},
      url        = {wlf-st-andrews-97.pdf},
    }
    This paper introduces a new compiler optimization called With-loop-folding. It is based on a special loop construct, the with-loop, which in the functional language SAC (for Single Assignment C) serves as a versatile vehicle to describe array operations on an elementwise basis. A general mechanism for combining two of these With-loops into a single loop construct is presented. This mechanism constitutes a powerful tool when it comes to generate efficiently executable code from high-level array specifications. By means of a few examples it is shown that even complex nestings of array operations similar to those available in Apl can be transformed into single loop operations which are similar to hand-optimized With-loop specifications. As a consequence, the way a complex array operation is combined from primitive array operations does not affect the runtime performance of the compiled code, i.e., the programmer is liberated from the burden to take performance considerations into account when specifying complex array operations.
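    A minimal sketch of the transformation (ours, under simplified assumptions): composing two element-wise With-loops first materialises a temporary array b; folding derives the second version, which does not.

      use Array: all;

      double[+] f(double[+] a)
      {
        b = with { (0*shape(a) <= iv < shape(a)) : a[iv] + 1.0; }
            : genarray(shape(a), 0.0);            /* temporary array b */
        return with { (0*shape(b) <= iv < shape(b)) : 2.0 * b[iv]; }
               : genarray(shape(b), 0.0);
      }

      double[+] f_folded(double[+] a)             /* what the optimisation derives */
      {
        return with { (0*shape(a) <= iv < shape(a)) : 2.0 * (a[iv] + 1.0); }
               : genarray(shape(a), 0.0);
      }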

1997

  • On Programming Scientific Applications in Sac - a Functional Language Extended by a Subsystem for High-level Array Operations Sven-Bodo Scholz (1997). In Implementation of Functional Languages, 8th International Workshop (IFL'96), Bonn, Germany, Selected Papers. pp. 85–104. Springer-Verlag. scientific-applications-sac-bonn-96.pdf BibTexAbstract
    @InProceedings{ScholzIFL96,
      author     = {Sven-Bodo Scholz},
      title      = {On Programming Scientific Applications in Sac - a Functional Language Extended by a Subsystem for High-level Array Operations},
      booktitle  = {Implementation of Functional Languages, 8th International Workshop (IFL'96), Bonn, Germany, Selected Papers},
      year       = {1997},
      editor     = {Werner Kluge},
      volume     = {1268},
      series     = {Lecture Notes in Computer Science},
      pages      = {85--104},
      publisher  = {Springer-Verlag},
      abstract   = {This paper discusses some of the pros and cons of extending a simple functional language called Sac (for Single-Assignment C) by array operations similar to those that are available in APL. The array operations in Sac are based on the ψ-calculus, an algebra of arrays which provides a formalism for specifying and simplifying array operations in terms of index set manipulations. The programming techniques made possible by Sac are demonstrated by means of a functional program for the approximation of numerical solutions of partial differential equations by multigrid relaxation. This application is not only of practical relevance but also fully exposes the flavors of using high-level array operations. In contrast to specifications in other languages, e.g. in Fortran or Sisal, the Sac program is well structured, reasonably concise, and - what is most important - invariant against dimensionalities and shapes. However, sophisticated compilation techniques are necessary to avoid - whenever possible - the creation of temporary arrays and to eliminate redundant operations. The paper also includes performance figures for a Sac implementation of the NAS-mgrid-benchmark which are competitive with those of a Sisal implementation.},
      category   = {apps,core,opt},
      pubaddress = {Berlin, Heidelberg, Germany},
      summary    = {This paper describes the high-level programming style intended by SAC. Multi-dimensional multigrid relaxation is investigated as a case study. The runtime performance of the SAC implementation is compared to equivalent Sisal and Fortran specifications both in terms of execution time and memory consumption.},
      topics     = {SAC,Scientific Computation},
      url        = {scientific-applications-sac-bonn-96.pdf},
    }
    This paper discusses some of the pros and cons of extending a simple functional language called Sac (for Single-Assignment C) by array operations similar to those that are available in APL. The array operations in Sac are based on the ψ-calculus, an algebra of arrays which provides a formalism for specifying and simplifying array operations in terms of index set manipulations. The programming techniques made possible by Sac are demonstrated by means of a functional program for the approximation of numerical solutions of partial differential equations by multigrid relaxation. This application is not only of practical relevance but also fully exposes the flavors of using high-level array operations. In contrast to specifications in other languages, e.g. in Fortran or Sisal, the Sac program is well structured, reasonably concise, and - what is most important - invariant against dimensionalities and shapes. However, sophisticated compilation techniques are necessary to avoid - whenever possible - the creation of temporary arrays and to eliminate redundant operations. The paper also includes performance figures for a Sac implementation of the NAS-mgrid-benchmark which are competitive with those of a Sisal implementation.
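    In the spirit of the paper, a small SaC sketch (ours; fixed to rank 2 for brevity, whereas the paper's specifications are rank-invariant) of one Jacobi-style relaxation sweep with fixed boundaries:

      use Array: all;

      double[.,.] relax(double[.,.] a)
      {
        return with {
                 ([1,1] <= iv < shape(a) - 1) :     /* inner elements only */
                   0.25 * (a[iv + [1,0]] + a[iv - [1,0]]
                           + a[iv + [0,1]] + a[iv - [0,1]]);
               } : modarray(a);                     /* boundaries unchanged */
      }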

2023

  • Rank-Polymorphism for Shape-Guided Blocking Artjoms vSinkarovs, Thomas Koopman, Sven-Bodo Scholz (2023). In Proceedings of the 11th ACM SIGPLAN International Workshop on Functional High-Performance and Numerical Computing. pp. 1–14. Association for Computing Machinery. New York, NY, USA. DOI BibTexAbstract
    @inproceedings{ShinKoopSchoFHPNC23,
     author = {\v{S}inkarovs, Artjoms and Koopman, Thomas and Scholz, Sven-Bodo},
     title = {Rank-Polymorphism for Shape-Guided Blocking},
     year = {2023},
     isbn = {9798400702969},
     publisher = {Association for Computing Machinery},
     address = {New York, NY, USA},
     artefact = {https://dl.acm.org/do/10.1145/3580402/},
     doi = {10.1145/3609024.3609410},
     abstract = {Many numerical algorithms on matrices or tensors can be formulated in a blocking style which improves performance due to better cache locality. In imperative languages, blocking is achieved by introducing additional layers of loops in a nested fashion alongside with suitable adjustments in index computations. While this process is tedious and error-prone, it is also difficult to implement a generically blocked version that would support arbitrary levels of blocking. At the example of matrix multiply, this paper demonstrates how rank-polymorphic array languages enable the specification of such generically blocked algorithms in a simple, recursive form. The depth of the blocking as well as blocking factors can be encoded in the structure of array shapes. In turn, reshaping arrays makes it possible to switch between blocked and non-blocked arrays. Through rank-polymorphic array combinators, any specification of loop boundaries or explicit index computations can be avoided. Firstly, we propose a dependently-typed framework for rank-polymorphic arrays. We use it to demonstrate that all blocked algorithms can be naturally derived by induction on the argument shapes. Our framework guarantees lack of out-of-bound indexing, and we also prove that all the blocked versions compute the same results as the canonical algorithm. Secondly, we translate our specification to the array language SaC. Not only do we show that we achieve similar conciseness in the implementation, but we also observe good performance of the generated code. We achieve a 7\% improvement compared to the highly-optimised OpenBLAS library, and 3\% compared to Intel’s MKL library when running on a 32-core shared-memory system.},
     booktitle = {Proceedings of the 11th ACM SIGPLAN International Workshop on Functional High-Performance and Numerical Computing},
     pages = {1--14},
     numpages = {14},
     keywords = {Matrix Multiply, Dependent Types, Array Programming, HPC, Rank-Polymorphism},
     location = {Seattle, WA, USA},
     category = {par,apps},
     series = {FHPNC 2023},
    }
    Many numerical algorithms on matrices or tensors can be formulated in a blocking style which improves performance due to better cache locality. In imperative languages, blocking is achieved by introducing additional layers of loops in a nested fashion alongside with suitable adjustments in index computations. While this process is tedious and error-prone, it is also difficult to implement a generically blocked version that would support arbitrary levels of blocking. At the example of matrix multiply, this paper demonstrates how rank-polymorphic array languages enable the specification of such generically blocked algorithms in a simple, recursive form. The depth of the blocking as well as blocking factors can be encoded in the structure of array shapes. In turn, reshaping arrays makes it possible to switch between blocked and non-blocked arrays. Through rank-polymorphic array combinators, any specification of loop boundaries or explicit index computations can be avoided. Firstly, we propose a dependently-typed framework for rank-polymorphic arrays. We use it to demonstrate that all blocked algorithms can be naturally derived by induction on the argument shapes. Our framework guarantees lack of out-of-bound indexing, and we also prove that all the blocked versions compute the same results as the canonical algorithm. Secondly, we translate our specification to the array language SaC. Not only do we show that we achieve similar conciseness in the implementation, but we also observe good performance of the generated code. We achieve a 7% improvement compared to the highly-optimised OpenBLAS library, and 3% compared to Intel’s MKL library when running on a 32-core shared-memory system.
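    A rough sketch of the shape-guided idea (our illustration; the paper's code and its dependently-typed framework differ): a With-loop rearranges a matrix into an [m,n] grid of [p,q] blocks, on which the canonical multiply below can then be applied blockwise and recursively.

      use Array: all;

      double[.,.,.,.] to_blocks(double[.,.] a, int p, int q)
      {
        m = shape(a)[0] / p;                 /* assumes p and q divide the extents */
        n = shape(a)[1] / q;
        return with {
                 ([0,0,0,0] <= iv=[i,j,k,l] < [m,n,p,q]) :
                   a[[i*p + k, j*q + l]];
               } : genarray([m, n, p, q], 0.0);
      }

      double[.,.] matmul(double[.,.] a, double[.,.] b)   /* canonical version */
      {
        bt = transpose(b);
        return with {
                 ([0,0] <= iv=[i,j] < [shape(a)[0], shape(bt)[0]]) :
                   sum(a[[i]] * bt[[j]]);
               } : genarray([shape(a)[0], shape(bt)[0]], 0.0);
      }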
  • On Generating Out-Of-Core GPU Code for Multi-Dimensional Array Operations Patrick van Beurden, Sven-Bodo Scholz (2023). In Proceedings of the 34th Symposium on Implementation and Application of Functional Languages. pp. ??–??. Association for Computing Machinery. New York, NY, USA. DOIbeurdenscholzifl22.pdf BibTexAbstract
    @inproceedings{BuerSchoIFL22, 
    author = {van Beurden, Patrick and Scholz, Sven-Bodo},
    title = {On Generating Out-Of-Core GPU Code for Multi-Dimensional Array Operations}, 
    year = {2023},
    isbn = {978-1-4503-9831-2/22/08}, 
    publisher = {Association for Computing Machinery}, 
    address = {New York, NY, USA}, 
    url = {beurdenscholzifl22.pdf}, 
    doi = {10.1145/3587216.3587223}, 
    abstract = {This paper presents the first results of our experiments for generating CUDA code that streams array operations over the elements of its array arguments from high-level specifications. We look at two classes of memory-bound array operations: map-like operations and iterative stencil computations. We investigate code patterns that stream the arguments of these operations from the host through the GPU and back taking the iterative nature of our experiments into account. We show that this setup does not only enable computations on arrays that are so big that they do not fit into the device memory of a single GPU (hence “out-of-core”), but we also demonstrate that the proposed streaming code outperforms non-streaming code versions even for smaller array sizes. For both application patterns, we observe memory throughputs that are beyond 80% of the hardware capability, irrespective of the problem sizes.},
    summary = {This paper shows the power of streaming for GPUs.},
    category = {par},
    booktitle = {Proceedings of the 34th Symposium on Implementation and Application of Functional Languages},
    pages = {??–??}, 
    numpages = {12}, 
    keywords = {code generation, GPUs, CUDA, streaming, memory management, out-of-core computation}, 
    location = {Copenhagen, Denmark}, 
    series = {IFL 2022} 
    }
    This paper presents the first results of our experiments for generating CUDA code that streams array operations over the elements of its array arguments from high-level specifications. We look at two classes of memory-bound array operations: map-like operations and iterative stencil computations. We investigate code patterns that stream the arguments of these operations from the host through the GPU and back taking the iterative nature of our experiments into account. We show that this setup does not only enable computations on arrays that are so big that they do not fit into the device memory of a single GPU (hence “out-of-core”), but we also demonstrate that the proposed streaming code outperforms non-streaming code versions even for smaller array sizes. For both application patterns, we observe memory throughputs that are beyond 80% of the hardware capability, irrespective of the problem sizes.

2022

  • Parallel Scan as a Multidimensional Array Problem Artjoms vSinkarovs, Sven-Bodo Scholz (2022). In Proceedings of the 8th ACM SIGPLAN International Workshop on Libraries, Languages and Compilers for Array Programming. pp. 1–11. Association for Computing Machinery. New York, NY, USA. DOIsinkarovsscholzarray22.pdf BibTexAbstract
    @inproceedings{SinkSchoARRAY22,
     author = {\v{S}inkarovs, Artjoms and Scholz, Sven-Bodo},
     title = {Parallel Scan as a Multidimensional Array Problem},
     year = {2022},
     isbn = {9781450392693},
     publisher = {Association for Computing Machinery},
     address = {New York, NY, USA},
     url = {SinkarovsScholzARRAY22.pdf},
     doi = {10.1145/3520306.3534500},
     summary = {The true power of rank-polymorphism at the example of scan.},
     category = {apps, par},
     abstract = {For many algorithms, it is challenging to identify a suitable parallel version, as the design space is typically very large. In this paper we demonstrate how rank-polymorphic array languages can be used as a tool to explore such design spaces through concise high-level specifications. If input data can be organised into a multi-dimensional array, and the algorithm can be stated as a recursive traversal over sub-arrays, array languages offer a lot of expressive power. The reason for this is that array shapes can be used to guide recursive traversals. Conciseness of specifications comes from array reshapes that move the desired elements into canonical hyperplanes. As a case study, we discuss several variants of implementing prefix sums (also known as scans) in SaC. We demonstrate how small code adjustments suffice to change the concurrency pattern exposed to the compiler. It turns out that variability that is typically achieved by generic inductive data types such as binary trees is a special case of what is offered by the array paradigm.},
     booktitle = {Proceedings of the 8th ACM SIGPLAN International Workshop on Libraries, Languages and Compilers for Array Programming},
     pages = {1–11},
     numpages = {11},
     keywords = {rank polymorphism, array languages, algorithms, parallelism, functional programming, prefix sum},
     location = {San Diego, CA, USA}, series = {ARRAY 2022} 
    }
    For many algorithms, it is challenging to identify a suitable parallel version, as the design space is typically very large. In this paper we demonstrate how rank-polymorphic array languages can be used as a tool to explore such design spaces through concise high-level specifications. If input data can be organised into a multi-dimensional array, and the algorithm can be stated as a recursive traversal over sub-arrays, array languages offer a lot of expressive power. The reason for this is that array shapes can be used to guide recursive traversals. Conciseness of specifications comes from array reshapes that move the desired elements into canonical hyperplanes. As a case study, we discuss several variants of implementing prefix sums (also known as scans) in SaC. We demonstrate how small code adjustments suffice to change the concurrency pattern exposed to the compiler. It turns out that variability that is typically achieved by generic inductive data types such as binary trees is a special case of what is offered by the array paradigm.
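    One way to read the core idea, as a hedged SaC sketch (helper names and the divisibility assumption are ours): reshape the vector into m rows, scan the rows independently, then add offsets obtained from a scan of the row totals.

      use Array: all;

      double[.] scan1d(double[.] v)          /* naive sequential specification */
      {
        r   = genarray(shape(v), 0.0);
        acc = 0.0;
        for (i = 0; i < shape(v)[0]; i++) {
          acc = acc + v[[i]];
          r   = modarray(r, [i], acc);
        }
        return r;
      }

      double[.] scan_blocked(double[.] v, int m)   /* assumes m divides the length */
      {
        n    = shape(v)[0] / m;
        mat  = reshape([m, n], v);
        rows = with { ([0] <= iv=[i] < [m]) : scan1d(mat[iv]); }
               : genarray([m], genarray([n], 0.0));    /* m independent scans */
        tot  = with { ([0] <= iv=[i] < [m]) : rows[[i, n-1]]; }
               : genarray([m], 0.0);                   /* per-row totals */
        offs = scan1d(tot) - tot;                      /* exclusive scan of totals */
        return reshape([m * n],
                       with { ([0,0] <= iv=[i,j] < [m, n]) : rows[iv] + offs[[i]]; }
                       : genarray([m, n], 0.0));
      }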
  • On Mapping N-Dimensional Data-Parallelism Efficiently into GPU-Thread-Spaces Niek Janssen, Sven-Bodo Scholz (2022). In 33rd Symposium on Implementation and Application of Functional Languages. pp. 54–66. Association for Computing Machinery. New York, NY, USA. DOIdoi.org_10.1145_3544885.3544894 BibTexAbstract
    @inproceedings{JansSchoIFL21,
     author = {Janssen, Niek and Scholz, Sven-Bodo},
     title = {On Mapping N-Dimensional Data-Parallelism Efficiently into GPU-Thread-Spaces},
     year = {2022}, isbn = {9781450386449},
     publisher = {Association for Computing Machinery},
     address = {New York, NY, USA},
     url = {https://doi.org/10.1145/3544885.3544894},
     doi = {10.1145/3544885.3544894},
     category = {par},
     summary = {Completely revamped kernel executions based on index space transformations.},
     abstract = {Data-Parallelism on multi-dimensional arrays can be conveniently specified as element-wise computations that are executed across n-dimensional index-spaces. While this at first glance matches very well the concept of n-dimensional thread-spaces as provided by the programming model of GPUs, in practice, a one-to-one correspondence often does not work out. This paper proposes a small set of combinators for mapping multi-dimensional index spaces onto multi-dimensional thread-index spaces suitable for execution on GPUs. For each combinator, we provide an inverse operation, which allows the original indices to be recovered within the individual threads. This setup facilitates arbitrary n-dimensional array computations to be executed on GPUs with arbitrary thread-space constraints provided the overall resources required for the computation do not exceed those of the GPU.},
     booktitle = {33rd Symposium on Implementation and Application of Functional Languages},
     pages = {54–66},
     numpages = {13},
     keywords = {SaC, array transformations, functional languages, GPU, CUDA},
     location = {Nijmegen, Netherlands},
     series = {IFL '21} 
    }
    Data-Parallelism on multi-dimensional arrays can be conveniently specified as element-wise computations that are executed across n-dimensional index-spaces. While this at first glance matches very well the concept of n-dimensional thread-spaces as provided by the programming model of GPUs, in practice, a one-to-one correspondence often does not work out. This paper proposes a small set of combinators for mapping multi-dimensional index spaces onto multi-dimensional thread-index spaces suitable for execution on GPUs. For each combinator, we provide an inverse operation, which allows the original indices to be recovered within the individual threads. This setup facilitates arbitrary n-dimensional array computations to be executed on GPUs with arbitrary thread-space constraints provided the overall resources required for the computation do not exceed those of the GPU.
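    A hedged sketch of the combinator idea (function names invented here): each index-space transformation is paired with an inverse, so every thread can recover its original n-dimensional index. Shown for the simplest case, flattening a 2-d space onto a 1-d one.

      /* iv = [i,j] within shp = [rows, cols]  <->  flat index [k] */
      int[.] flatten2(int[.] iv, int[.] shp)
      {
        return [iv[[0]] * shp[[1]] + iv[[1]]];
      }

      int[.] unflatten2(int[.] k, int[.] shp)
      {
        return [k[[0]] / shp[[1]], k[[0]] % shp[[1]]];
      }

      /* unflatten2(flatten2(iv, shp), shp) == iv for all iv within shp */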

2015

  • Type-driven Data Layouts for Improved Vectorisation Artjoms Šinkarovs, Sven-Bodo Scholz (May 2015). Concurrency and Computation: Practice and Experience 28 (7) pp. 2092–2119. Wiley-Blackwell. DOI BibTexAbstract
    @Article{SinkSchoCCPE15,
      author    = {Šinkarovs, Artjoms and Scholz, Sven-Bodo},
      title     = {Type-driven Data Layouts for Improved Vectorisation},
      journal   = {Concurrency and Computation: Practice and Experience},
      year      = {2015},
      volume    = {28},
      number    = {7},
      pages     = {2092--2119},
      month     = {May},
      abstract  = {Vector instructions of modern CPUs are crucially important for the performance of compute-intensive algorithms. Auto-vectorisation often fails because of an unfortunate choice of data layout by the programmer. This paper proposes a data layout inference for auto-vectorisation that identifies layout transformations that convert single instruction, multiple data-unfavourable layouts of data structures into favourable ones. We present a type system for layout transformations, and we sketch an inference algorithm for it. Finally, we present some initial performance figures for the impact of the inferred layout transformations. They show that non-intuitive layouts that are inferred through our system can have a vast performance impact on compute intensive programs.},
      category  = {types,opt,par},
      doi       = {10.1002/cpe.3501},
      issn      = {1532-0626},
      keywords  = {Data parallelism, Hpc, Vectorisation},
      publisher = {Wiley-Blackwell},
    }
    Vector instructions of modern CPUs are crucially important for the performance of compute-intensive algorithms. Auto-vectorisation often fails because of an unfortunate choice of data layout by the programmer. This paper proposes a data layout inference for auto-vectorisation that identifies layout transformations that convert single instruction, multiple data-unfavourable layouts of data structures into favourable ones. We present a type system for layout transformations, and we sketch an inference algorithm for it. Finally, we present some initial performance figures for the impact of the inferred layout transformations. They show that non-intuitive layouts that are inferred through our system can have a vast performance impact on compute intensive programs.
  • Making Fortran Legacy Code More Functional: Using the BGS Geomagnetic Field Modelling System As an Example Hans-Nikolai Viessmann, Sven-Bodo Scholz, Artjoms Sinkarovs, Brian Bainbridge, Brian Hamilton, Simon Flower (2015). In Proceedings of the 27th Symposium on the Implementation and Application of Functional Programming Languages. pp. 11:1–11:13. ACM. New York, NY, USA. DOI BibTexAbstract
    @InProceedings{VSSIFL2015,
     author = {Viessmann, Hans-Nikolai and Scholz, Sven-Bodo and Sinkarovs, Artjoms and Bainbridge, Brian and Hamilton, Brian and Flower, Simon},
     title = {Making Fortran Legacy Code More Functional: Using the BGS Geomagnetic Field Modelling System As an Example},
     booktitle = {Proceedings of the 27th Symposium on the Implementation and Application of Functional Programming Languages},
     series = {IFL '15},
     year = {2015},
     isbn = {978-1-4503-4273-5},
     location = {Koblenz, Germany},
     pages = {11:1--11:13},
     articleno = {11},
     numpages = {13},
     doi = {10.1145/2897336.2897348},
     acmid = {2897348},
     publisher = {ACM},
     address = {New York, NY, USA},
     keywords = {eigensystem, foreign function interface, fortran, functional programming, high-performance computing, single-assignment C},
     summary = {This paper presents the support for calling SaC from Fortran and it reports about the experiences in the context of the BGS Geomagnetic Field Modelling System code.},
     abstract  = {This paper presents an application case study of the British Geological Survey's (BGS) Geomagnetic Field Modelling System code. The program consists of roughly 20 000 lines of highly-tuned Fortran MPI code that has a runtime of about 12 hours for a single execution cycle on a cluster utilising approximately 100 CPU cores. The program contains a sequential bottleneck that executes on a single node of the cluster and takes up to 50% of the overall runtime. We describe an experiment in which we rewrote the bottleneck Fortran code in SaC, to make use of auto-parallelisation provided by the SaC compiler. The paper also presents an implementation of a foreign-function interface, to link the SaC kernel with the Fortran application. Our initial performance measurements compare the SaC kernel performance with the Fortran bottleneck code; we also present results using an OpenMP Fortran implementation. Our figures show that the SaC-based implementation achieves roughly a 12.5% runtime improvement, and outperforms the OpenMP implementation.},
     category = {core,ffi,apps,par},
    }
    This paper presents an application case study of the British Geological Survey's (BGS) Geomagnetic Field Modelling System code. The program consists of roughly 20 000 lines of highly-tuned Fortran MPI code that has a runtime of about 12 hours for a single execution cycle on a cluster utilising approximately 100 CPU cores. The program contains a sequential bottleneck that executes on a single node of the cluster and takes up to 50% of the overall runtime. We describe an experiment in which we rewrote the bottleneck Fortran code in SaC, to make use of auto-parallelisation provided by the SaC compiler. The paper also presents an implementation of a foreign-function interface, to link the SaC kernel with the Fortran application. Our initial performance measurements compare the SaC kernel performance with the Fortran bottleneck code; we also present results using an OpenMP Fortran implementation. Our figures show that the SaC-based implementation achieves roughly a 12.5% runtime improvement, and outperforms the OpenMP implementation.

2013

  • Towards Heterogeneous Computing without Heterogeneous Programming M. Diogo, C. Grelck (2013). In Trends in Functional Programming, 13th Symposium, TFP 2012, St.Andrews, UK. pp. 279–294. Springer. DOI2013_0.pdf BibTexAbstract
    @InProceedings{DiogGrelTFP12,
      author      = {M. Diogo and C. Grelck},
      title       = {Towards Heterogeneous Computing without Heterogeneous Programming},
      booktitle   = {Trends in Functional Programming, 13th Symposium, TFP 2012, St.Andrews, UK},
      year        = {2013},
      editor      = {K. Hammond and H.W. Loidl},
      volume      = {7829},
      series      = {Lecture Notes in Computer Science},
      pages       = {279--294},
      publisher   = {Springer},
      summary     = {},
      abstract    = {From laptops to supercomputer nodes hardware architectures become increasingly heterogeneous, combining at least multiple general-purpose cores with one or even multiple GPU accelerators. Taking effective advantage of such systems’ capabilities becomes increasingly important, but is even more challenging. SaC is a functional array programming language with support for fully automatic parallelization following a data-parallel approach. Typical SaC programs are good matches for both conventional multi-core processors as well as many-core accelerators. Indeed, SaC supports both architectures in an entirely compiler-directed way, but so far a choice must be made at compile time: either the compiled code utilizes multiple cores and ignores a potentially available accelerator, or it uses a single GPU while ignoring all but one core of the host system. We present a compilation scheme and corresponding runtime system support that combine both code generation alternatives to harness the computational forces of multiple general-purpose cores and multiple GPU accelerators to collaboratively execute SaC programs without explicit encoding in the programs themselves and thus without going through the hassle of heterogeneous programming.},
      category    = {core, par},
      DOI         = {10.1007/978-3-642-40447-4_18},
      URL         = {2013_0.pdf},
    }
    From laptops to supercomputer nodes hardware architectures become increasingly heterogeneous, combining at least multiple general-purpose cores with one or even multiple GPU accelerators. Taking effective advantage of such systems’ capabilities becomes increasingly important, but is even more challenging. SaC is a functional array programming language with support for fully automatic parallelization following a data-parallel approach. Typical SaC programs are good matches for both conventional multi-core processors as well as many-core accelerators. Indeed, SaC supports both architectures in an entirely compiler-directed way, but so far a choice must be made at compile time: either the compiled code utilizes multiple cores and ignores a potentially available accelerator, or it uses a single GPU while ignoring all but one core of the host system. We present a compilation scheme and corresponding runtime system support that combine both code generation alternatives to harness the computational forces of multiple general-purpose cores and multiple GPU accelerators to collaboratively execute SaC programs without explicit encoding in the programs themselves and thus without going through the hassle of heterogeneous programming.

2011

  • Breaking the Gpu Programming Barrier with the Auto-parallelising Sac Compiler Jing Guo, Jeyarajan Thiyagalingam, Sven-Bodo Scholz (2011). In 6th Workshop on Declarative Aspects of Multicore Programming (DAMP'11), Austin, USA. pp. 15–24. ACM Press. DOI2011_5.pdf BibTexAbstract
    @InProceedings{GuoJeyaSchoDAMP11,
      author     = {Jing Guo and Jeyarajan Thiyagalingam and Sven-Bodo Scholz},
      title      = {Breaking the Gpu Programming Barrier with the Auto-parallelising Sac Compiler},
      booktitle  = {6th Workshop on Declarative Aspects of Multicore Programming (DAMP'11), Austin, USA},
      year       = {2011},
      pages      = {15--24},
      publisher  = {ACM Press},
      abstract   = {Over recent years, the use of Graphics Processing Units (GPUs) for general-purpose computing has become increasingly popular. The main reasons for this development are the attractive performance/price and performance/power ratios of these architectures. However, substantial performance gains from GPUs come at a price: they require extensive programming expertise and, typically, a substantial re-coding effort. Although the programming experience has been significantly improved by existing frameworks like CUDA and OpenCL, it is still a challenge to effectively utilise these devices. Directive-based approaches such as hiCUDA or OpenMP-variants offer further improvements but have not eliminated the need for the expertise on these complex architectures. Similarly, special purpose programming languages such as Microsoft's Accelerator try to lower the barrier further. They provide the programmer with a special form of GPU data structures and operations on them which are then compiled into GPU code. In this paper, we take this trend towards a completely implicit, high-level approach yet another step further. We generate CUDA code from a MATLAB-like high level functional array programming language, Single Assignment C (SaC). To do so, we identify which data structures and operations can be successfully mapped on GPUs and transform existing programs accordingly. This paper presents the first runtime results from our GPU backend and it presents the basic set of GPU-specific program optimisations that turned out to be essential. Despite our high-level program specifications, we show that for a number of benchmarks speedups between a factor of 5 and 50 can be achieved through our parallelising compiler.},
      affil      = {ctca},
      category   = {par,core,opt},
      doi        = {10.1145/1926354.1926359},
      pubaddress = {New York, NY, USA},
      url        = {2011_5.pdf},
    }
    Over recent years, the use of Graphics Processing Units (GPUs) for general-purpose computing has become increasingly popular. The main reasons for this development are the attractive performance/price and performance/power ratios of these architectures. However, substantial performance gains from GPUs come at a price: they require extensive programming expertise and, typically, a substantial re-coding effort. Although the programming experience has been significantly improved by existing frameworks like CUDA and OpenCL, it is still a challenge to effectively utilise these devices. Directive-based approaches such as hiCUDA or OpenMP-variants offer further improvements but have not eliminated the need for the expertise on these complex architectures. Similarly, special purpose programming languages such as Microsoft's Accelerator try to lower the barrier further. They provide the programmer with a special form of GPU data structures and operations on them which are then compiled into GPU code. In this paper, we take this trend towards a completely implicit, high-level approach yet another step further. We generate CUDA code from a MATLAB-like high level functional array programming language, Single Assignment C (SaC). To do so, we identify which data structures and operations can be successfully mapped on GPUs and transform existing programs accordingly. This paper presents the first runtime results from our GPU backend and it presents the basic set of GPU-specific program optimisations that turned out to be essential. Despite our high-level program specifications, we show that for a number of benchmarks speedups between a factor of 5 and 50 can be achieved through our parallelising compiler.
  • Using Openmp As an Alternative Parallelization Strategy in Sac Zheng Zhangzheng (2011). Amsterdam, Netherlands. 2011_6.pdf BibTex
    @MastersThesis{Zhangzheng11,
      author  = {Zheng Zhangzheng},
      title   = {Using Openmp As an Alternative Parallelization Strategy in Sac},
      school  = {University of Amsterdam},
      year    = {2011},
      category = {par, core},
      address = {Amsterdam, Netherlands},
      url     = {2011_6.pdf},
    }
  • Concurrent Non-deferred Reference Counting on the Microgrid: First Experiences S. Herhut, C. Joslin, S.B. Scholz, R. Poss, C. Grelck (2011). In 22nd International Symposium on Implementation and Application of Functional Languages (IFL'10), Alphen a/d Rijn, Netherlands, Revised Selected Papers. Springer. DOI2011_2.pdf BibTexAbstract
    @InProceedings{HerhJoslSchoetalIFL10,
      author     = {S. Herhut and C. Joslin and S.B. Scholz and R. Poss and C. Grelck},
      title      = {Concurrent Non-deferred Reference Counting on the Microgrid: First Experiences},
      booktitle  = {22nd International Symposium on Implementation and Application of Functional Languages (IFL'10), Alphen a/d Rijn, Netherlands, Revised Selected Papers},
      year       = {2011},
      editor     = {J. Haage and M. Moraz\'an},
      volume     = {6647},
      series     = {Lecture Notes in Computer Science},
      publisher  = {Springer},
      abstract   = {We present a first evaluation of our novel approach for non-deferred reference counting on the Microgrid many-core architecture. Non-deferred reference counting is a fundamental building block of implicit heap management of functional array languages in general and Single Assignment C in particular. Existing lock-free approaches for multi-core and SMP settings do not scale well for large numbers of cores in emerging many-core platforms. We, instead, employ a dedicated core for reference counting and use asynchronous messaging to emit reference counting operations. This novel approach decouples computational workload from reference-counting overhead. Experiments using cycle-accurate simulation of a realistic Microgrid show that, by exploiting asynchronism, we are able to tolerate even worst-case reference counting loads reasonably well. Scalability is essentially limited only by the combined sequential runtime of all reference counting operations, in accordance with Amdahl’s law. Even though developed in the context of Single Assignment C and the Microgrid, our approach is applicable to a wide range of languages and platforms.},
      affil      = {ctca},
      category   = {par},
      doi        = {10.1007/978-3-642-24276-2_12},
      pubaddress = {Berlin, Heidelberg, Germany},
      topics     = {SAC},
      url        = {2011_2.pdf},
    }
    We present a first evaluation of our novel approach for non-deferred reference counting on the Microgrid many-core architecture. Non-deferred reference counting is a fundamental building block of implicit heap management of functional array languages in general and Single Assignment C in particular. Existing lock-free approaches for multi-core and SMP settings do not scale well for large numbers of cores in emerging many-core platforms. We, instead, employ a dedicated core for reference counting and use asynchronous messaging to emit reference counting operations. This novel approach decouples computational workload from reference-counting overhead. Experiments using cycle-accurate simulation of a realistic Microgrid show that, by exploiting asynchronism, we are able to tolerate even worst-case reference counting loads reasonably well. Scalability is essentially limited only by the combined sequential runtime of all reference counting operations, in accordance with Amdahl’s law. Even though developed in the context of Single Assignment C and the Microgrid, our approach is applicable to a wide range of languages and platforms.

2010

  • Compiler-support for Robust Multi-core Computing Raimund Kirner, Stephan Herhut, Sven-Bodo Scholz (2010). In 4th International Symposium on Leveraging Applications of Formal Methods, Verification and Validation. Springer Verlag. DOI2010_1.pdf BibTexAbstract
    @InProceedings{KirnHerhScho10,
      author     = {Raimund Kirner and Stephan Herhut and Sven-Bodo Scholz},
      title      = {Compiler-support for Robust Multi-core Computing},
      booktitle  = {4th International Symposium on Leveraging Applications of Formal Methods, Verification and Validation},
      year       = {2010},
      publisher  = {Springer Verlag},
      abstract   = {Embedded computing is characterised by the limited availability of computing resources. Further, embedded systems are often used in safety-critical applications with real-time constraints. Thus, the software development has to follow rigorous procedures to minimise the risk of system failures. However, besides the inherent application complexities, there is also an increased technology-based complexity due to the shift to concurrent programming of multi-core systems. For such systems it is quite challenging to develop safe and resource-efficient systems. In this paper we give a plea for the need of better software development tools to cope with this challenge. For example, we outline how compilers can help to simplify the writing of fault-tolerant and robust software, which keeps the application code more compact, comprehensive, and maintainable. We take a rather extreme stand by promoting a functional programming approach. This functional programming paradigm reduces the complexity of program analysis and thus allows for more efficient and powerful techniques. We will implement an almost transparent support for robustness within the SaC research compiler, which accepts a C-like functional program as input. Compared to conventional approaches in the field of automatic software-controlled resilience, our functional setting will allow for lower overhead, making the approach interesting for embedded computing as well as for high-performance computing.},
      affil      = {ctca},
      category   = {core,par},
      doi        = {10.1007/978-3-642-16558-0_6},
      location   = {Amirandes, Crete},
      pubaddress = {Berlin, Heidelberg, Germany},
      topics     = {SAC, robustness, compiler-based robustness, fault-tolerance},
      url        = {2010_1.pdf},
    }
    Embedded computing is characterised by the limited availability of computing resources. Further, embedded systems are often used in safety-critical applications with real-time constraints. Thus, the software development has to follow rigorous procedures to minimise the risk of system failures. However, besides the inherent application complexities, there is also an increased technology-based complexity due to the shift to concurrent programming of multi-core systems. For such systems it is quite challenging to develop safe and resource-efficient systems. In this paper we give a plea for the need of better software development tools to cope with this challenge. For example, we outline how compilers can help to simplify the writing of fault-tolerant and robust software, which keeps the application code more compact, comprehensive, and maintainable. We take a rather extreme stand by promoting a functional programming approach. This functional programming paradigm reduces the complexity of program analysis and thus allows for more efficient and powerful techniques. We will implement an almost transparent support for robustness within the SaC research compiler, which accepts a C-like functional program as input. Compared to conventional approaches in the field of automatic software-controlled resilience, our functional setting will allow for lower overhead, making the approach interesting for embedded computing as well as for high-performance computing.
  • An Adaptive Compilation Framework for Generic Data-parallel Array Programming Clemens Grelck, Tim van Deurzen, Stephan Herhut, Sven-Bodo Scholz (2010). In 15th Workshop on Compilers for Parallel Computing (CPC'10). Vienna University of Technology, Vienna, Austria. 2010_3.pdf BibTexAbstract
    @InProceedings{GrelDeurHerhetalCPC10,
      author    = {Clemens Grelck and Tim van Deurzen and Stephan Herhut and Sven-Bodo Scholz},
      title     = {An Adaptive Compilation Framework for Generic Data-parallel Array Programming},
      booktitle = {15th Workshop on Compilers for Parallel Computing (CPC'10)},
      year      = {2010},
      publisher = {Vienna University of Technology, Vienna, Austria},
      abstract  = {Generic array programming abstracts from structural properties of arrays, such as rank (number of axes/dimensions) and shape (number of elements along each axis/dimension). This allows for abstract program specifications and, as such, is desirable from a software engineering perspective. However, generic programming in this sense does have an adverse effect on runtime performance, at least when executed naively. Static compiler analyses and transformations aim at reconciling software engineering desires for generic code with runtime performance requirements. However, they are bound to fail whenever the required information is not available until runtime. We propose a compilation framework that overcomes the inherent limitations of static analysis by incrementally adapting a running program to the structural properties of the arrays it operates on. This is achieved by partial recompilation of code at runtime, when all structural properties of arrays are known, and dynamic relinking of the running program with dynamically generated code. We sketch out the general compilation framework architecture and provide some details on implementation issues.},
      affil     = {ctca},
      category  = {par,core},
      topics    = {SAC},
      url       = {2010_3.pdf},
    }
    Generic array programming abstracts from structural properties of arrays, such as rank (number of axes/dimensions) and shape (number of elements along each axis/dimension). This allows for abstract program specifications and, as such, is desirable from a software engineering perspective. However, generic programming in this sense does have an adverse effect on runtime performance, at least when executed naively. Static compiler analyses and transformations aim at reconciling software engineering desires for generic code with runtime performance requirements. However, they are bound to fail whenever the required information is not available until runtime. We propose a compilation framework that overcomes the inherent limitations of static analysis by incrementally adapting a running program to the structural properties of the arrays it operates on. This is achieved by partial recompilation of code at runtime, when all structural properties of arrays are known, and dynamic relinking of the running program with dynamically generated code. We sketch out the general compilation framework architecture and provide some details on implementation issues.
  • Thread-local Stacks, a Light-weight Alternative to Thread-local Heaps Stephan Herhut, Carl Joslin, Sven-Bodo Scholz (2010). In 15th Workshop on Compilers for Parallel Computing (CPC'10). Vienna University of Technology, Vienna, Austria. BibTex
    @InProceedings{HerhJoslSchoCPC10,
      author    = {Stephan Herhut and Carl Joslin and Sven-Bodo Scholz},
      title     = {Thread-local Stacks, a Light-weight Alternative to Thread-local Heaps},
      booktitle = {15th Workshop on Compilers for Parallel Computing (CPC'10)},
      year      = {2010},
      publisher = {Vienna University of Technology, Vienna, Austria},
      affil     = {ctca},
      category  = {par,core},
      topics    = {SAC},
    }

2009

  • Truly Nested Data-parallelism: Compiling Sac to the Microgrid Architecture Stephan Herhut, Carl Joslin, Sven-Bodo Scholz, Clemens Grelck (2009). In 21st Symposium on Implementation and Application of Functional Languages (IFL'09), South Orange, NJ, USA. Seton Hall University. 2009_0.pdf BibTexAbstract
    @InProceedings{HerhJoslSchoetalIFL09draft,
      author     = {Stephan Herhut and Carl Joslin and Sven-Bodo Scholz and Clemens Grelck},
      title      = {Truly Nested Data-parallelism: Compiling Sac to the Microgrid Architecture},
      booktitle  = {21st Symposium on Implementation and Application of Functional Languages (IFL'09), South Orange, NJ, USA},
      year       = {2009},
      number     = {SHU-TR-CS-2009-09-1},
      publisher  = {Seton Hall University},
      abstract   = {Data-parallel programming facilitates elegant specification of concurrency. However, the composability of data-parallel operations so far has been constrained by the requirement to have only flat data-parallel operations at runtime. In this paper, we present early results on our work to exploit hardware support for nested concurrency to directly map nested data-parallel operations in high-level specifications to low-level codes that can be efficiently executed. To this effect, we have devised a compilation scheme from data-parallel operations in SaC to the systems programming language of the Microgrid architecture. Furthermore, we present early empirical results to assert the viability of our approach.},
      affil      = {ctca},
      category   = {par,core},
      pubaddress = {South Orange, NJ, USA},
      url        = {2009_0.pdf},
    }
    Data-parallel programming facilitates elegant specification of concurrency. However, the composability of data-parallel operations so far has been constrained by the requirement to have only flat data-parallel operations at runtime. In this paper, we present early results on our work to exploit hardware support for nested concurrency to directly map nested data-parallel operations in high-level specifications to low-level codes that can be efficiently executed. To this effect, we have devised a compilation scheme from data-parallel operations in SaC to the systems programming language of the Microgrid architecture. Furthermore, we present early empirical results to assert the viability of our approach.
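    To make the "flat only" restriction concrete: conventional data-parallel back ends flatten nested maps into a single parallel dimension, as in the C/OpenMP sketch below, whereas the paper targets hardware that can execute the inner map as genuinely nested threads. The constants are arbitrary and this is not the Microgrid code path, just the workaround it replaces.

    #include <omp.h>

    #define M 512
    #define N 512

    /* A nested element-wise map. The collapse(2) clause flattens the two
       loops into one parallel index space, which is the conventional way
       of avoiding nested parallelism on commodity hardware. */
    void nested_map(double a[M][N], const double b[M][N])
    {
        #pragma omp parallel for collapse(2)
        for (int i = 0; i < M; i++)        /* outer data-parallel map */
            for (int j = 0; j < N; j++)    /* inner data-parallel map */
                a[i][j] = 2.0 * b[i][j];
    }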
  • Compiling the Functional Data-parallel Language Sac for Microgrids of Self-adaptive Virtual Processors Clemens Grelck, Stephan Herhut, Chris Jesshope, Carl Joslin, Mike Lankamp, Sven-Bodo Scholz, Alex Shafarenko (2009). In 14th Workshop on Compilers for Parallel Computing (CPC'09), IBM Research Center, Zürich, Switzerland. 2009_4.pdf BibTexAbstract
    @InProceedings{GrelJessJosletalCPC09,
      author    = {Clemens Grelck and Stephan Herhut and Chris Jesshope and Carl Joslin and Mike Lankamp and Sven-Bodo Scholz and Alex Shafarenko},
      title     = {Compiling the Functional Data-parallel Language Sac for Microgrids of Self-adaptive Virtual Processors},
      booktitle = {14th Workshop on Compilers for Parallel Computing (CPC'09), IBM Research Center, Z\"urich, Switzerland},
      year      = {2009},
      abstract  = {We present preliminary results from compiling the high-level, functional and data-parallel programming language SaC into a novel multi-core design: Microgrids of Self-Adaptive Virtual Processors (SVPs). The side-effect free nature of SaC in conjunction with its data-parallel foundation make it an ideal candidate for auto-parallelisation. Notwithstanding these favourable pre-conditions, scheduling and data-placement pose major challenges for effective parallelisation of irregular applications because fine-grained dynamic approaches inflict large overheads on conventional architectures. The Microgrid architecture promises a radical shift: thread creation and context switches are implemented in hardware and cause negligible overhead. Likewise, scheduling of tasks to computing resources is catered for by hardware. This paper investigates aspects of the Microgrid architecture which may influence the overall performance of compiled data-parallel programs. In particular, we look at alternative thread creation schemes for n-dimensional data-parallel operations and their effect on overall performance. Furthermore, we investigate the architecture's capability to cope with workload imbalances within such operations. The paper presents a sequence of experiments on a cycle-accurate emulator of the Microgrid architecture from which we derive some guiding principles for an effective compilation of data parallel operations. Based on these principles, we present a compilation scheme for the data-parallel core of SaC.},
      affil     = {ctca},
      category  = {par,design,core},
      topics    = {SAC},
      url       = {2009_4.pdf},
    }
    We present preliminary results from compiling the high-level, functional and data-parallel programming language SaC into a novel multi-core design: Microgrids of Self-Adaptive Virtual Processors (SVPs). The side-effect free nature of SaC in conjunction with its data-parallel foundation make it an ideal candidate for auto-parallelisation. Notwithstanding these favourable pre-conditions, scheduling and data-placement pose major challenges for effective parallelisation of irregular applications because fine-grained dynamic approaches inflict large overheads on conventional architectures. The Microgrid architecture promises a radical shift: thread creation and context switches are implemented in hardware and cause negligible overhead. Likewise, scheduling of tasks to computing resources is catered for by hardware. This paper investigates aspects of the Microgrid architecture which may influence the overall performance of compiled data-parallel programs. In particular, we look at alternative thread creation schemes for n-dimensional data-parallel operations and their effect on overall performance. Furthermore, we investigate the architecture's capability to cope with workload imbalances within such operations. The paper presents a sequence of experiments on a cycle-accurate emulator of the Microgrid architecture from which we derive some guiding principles for an effective compilation of data parallel operations. Based on these principles, we present a compilation scheme for the data-parallel core of SaC.
  • Controlling Chaos — on Safe Side-effects in Data-parallel Operations Stephan Herhut, Sven-Bodo Scholz, Clemens Grelck (2009). In 4th Workshop on Declarative Aspects of Multicore Programming (DAMP'09), Savannah, USA. pp. 59–67. ACM Press. DOI 2009_5.pdf BibTexAbstract
    @InProceedings{HerhSchoGrelDAMP09,
      author     = {Stephan Herhut and Sven-Bodo Scholz and Clemens Grelck},
      title      = {Controlling Chaos --- on Safe Side-effects in Data-parallel Operations},
      booktitle  = {4th Workshop on Declarative Aspects of Multicore Programming (DAMP'09), Savannah, USA},
      year       = {2009},
      editor     = {Manuel Chakravarty and Leaf Peterson},
      pages      = {59--67},
      publisher  = {ACM Press},
      abstract   = {With the rising variety of hardware designs for multi-core systems, the effectiveness in exploiting implicit concurrency of programs plays a more vital role for programming such systems than ever before. We believe that a combination of a data-parallel approach with a declarative programming-style is up to that task: Data-parallel approaches are known to enable compilers to make efficient use of multi-processors without requiring low-level program annotations. Combining the data-parallel approach with a declarative programming-style guarantees semantic equivalence between sequential and concurrent executions of data parallel operations. Furthermore, the side-effect free setting and explicit model of dependencies enables compilers to maximise the size of the data-parallel program sections. However, the strength of the rigidity of the declarative approach also constitutes its weakness: Being bound to observe all data dependencies categorically rules out the use of side-effecting operations within data-parallel sections. Not only does this limit the size of these regions in certain situations, but it may also hamper an effective workload distribution. Considering side effects such as plotting individual pixels of an image or output for debugging purposes, there are situations where a non-deterministic order of side-effects would not be considered harmful at all. We propose a mechanism for enabling such non-determinism on the execution of side-effecting operations within data-parallel sections without sacrificing the side-effect free setting in general. Outside of the data-parallel sections we ensure single-threading of side-effecting operations using uniqueness typing. Within data-parallel operations however we allow the side-effecting operations of different threads to occur in any order, as long as effects of different threads are not interleaved. Furthermore, we still model the dependencies arising from the manipulated states within the data parallel sections. This measure preserves the explicitness of all data dependencies and therefore it preserves the transformational potential of any restructuring compiler.},
      affil      = {ctca},
      category   = {par,states},
      doi        = {10.1145/1481839.1481847},
      isbn       = {978-1-60558-417-1},
      pubaddress = {New York, NY, USA},
      topics     = {SAC},
      url        = {2009_5.pdf},
    }
    With the rising variety of hardware designs for multi-core systems, the effectiveness in exploiting implicit concurrency of programs plays a more vital role for programming such systems than ever before. We believe that a combination of a data-parallel approach with a declarative programming-style is up to that task: Data-parallel approaches are known to enable compilers to make efficient use of multi-processors without requiring low-level program annotations. Combining the data-parallel approach with a declarative programming-style guarantees semantic equivalence between sequential and concurrent executions of data parallel operations. Furthermore, the side-effect free setting and explicit model of dependencies enables compilers to maximise the size of the data-parallel program sections. However, the strength of the rigidity of the declarative approach also constitutes its weakness: Being bound to observe all data dependencies categorically rules out the use of side-effecting operations within data-parallel sections. Not only does this limit the size of these regions in certain situations, but it may also hamper an effective workload distribution. Considering side effects such as plotting individual pixels of an image or output for debugging purposes, there are situations where a non-deterministic order of side-effects would not be considered harmful at all. We propose a mechanism for enabling such non-determinism on the execution of side-effecting operations within data-parallel sections without sacrificing the side-effect free setting in general. Outside of the data-parallel sections we ensure single-threading of side-effecting operations using uniqueness typing. Within data-parallel operations however we allow the side-effecting operations of different threads to occur in any order, as long as effects of different threads are not interleaved. Furthermore, we still model the dependencies arising from the manipulated states within the data parallel sections. This measure preserves the explicitness of all data dependencies and therefore it preserves the transformational potential of any restructuring compiler.
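    The guarantee that effects of different threads may occur in any order but are never interleaved can be approximated in plain C with POSIX threads. The following is a minimal hand-rolled sketch, not the uniqueness-typing mechanism of the paper: each thread buffers its debug output privately and flushes it in one piece.

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t flush_lock = PTHREAD_MUTEX_INITIALIZER;

    static void *worker(void *arg)
    {
        int tid = *(int *)arg;
        char buf[256];
        int len = 0;

        /* Data-parallel work with locally buffered side effects. */
        for (int i = 0; i < 4; i++)
            len += snprintf(buf + len, sizeof buf - len,
                            "thread %d: element %d\n", tid, i);

        pthread_mutex_lock(&flush_lock);    /* any order, no interleaving */
        fputs(buf, stdout);
        pthread_mutex_unlock(&flush_lock);
        return NULL;
    }

    int main(void)
    {
        pthread_t t[4];
        int ids[4] = {0, 1, 2, 3};
        for (int i = 0; i < 4; i++) pthread_create(&t[i], NULL, worker, &ids[i]);
        for (int i = 0; i < 4; i++) pthread_join(t[i], NULL);
        return 0;
    }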
  • Controlling Chaos — on Safe Side-effects in Data-parallel Operations Stephan Herhut, Sven-Bodo Scholz, Clemens Grelck (2009). ACM SIGPLAN Notices 44 (5) pp. 9–10. 2009_6.pdf BibTexAbstract
    @Article{HerhSchoGrelSPN09,
      author   = {Stephan Herhut and Sven-Bodo Scholz and Clemens Grelck},
      title    = {Controlling Chaos --- on Safe Side-effects in Data-parallel Operations},
      journal  = {ACM SIGPLAN Notices},
      year     = {2009},
      volume   = {44},
      number   = {5},
      pages    = {9--10},
      abstract = {With the rising variety of hardware designs for multi-core systems, the effectiveness in exploiting implicit concurrency of programs plays a more vital role for programming such systems than ever before. We believe that a combination of a data-parallel approach with a declarative programming-style is up to that task: Data-parallel approaches are known to enable compilers to make efficient use of multi-processors without requiring low-level program annotations. Combining the data-parallel approach with a declarative programming-style guarantees semantic equivalence between sequential and concurrent executions of data parallel operations. Furthermore, the side-effect free setting and explicit model of dependencies enables compilers to maximise the size of the data-parallel program sections. However, the strength of the rigidity of the declarative approach also constitutes its weakness: Being bound to observe all data dependencies categorically rules out the use of side-effecting operations within data-parallel sections. Not only does this limit the size of these regions in certain situations, but it may also hamper an effective workload distribution. Considering side effects such as plotting individual pixels of an image or output for debugging purposes, there are situations where a non-deterministic order of side-effects would not be considered harmful at all. We propose a mechanism for enabling such non-determinism on the execution of side-effecting operations within data-parallel sections without sacrificing the side-effect free setting in general. Outside of the data-parallel sections we ensure single-threading of side-effecting operations using uniqueness typing. Within data-parallel operations however we allow the side-effecting operations of different threads to occur in any order, as long as effects of different threads are not interleaved. Furthermore, we still model the dependencies arising from the manipulated states within the data parallel sections. This measure preserves the explicitness of all data dependencies and therefore it preserves the transformational potential of any restructuring compiler.},
      affil    = {ctca},
      category = {par,states},
      issn     = {0362-1340},
      topics   = {SAC},
      url      = {2009_6.pdf},
    }
    With the rising variety of hardware designs for multi-core systems, the effectiveness in exploiting implicit concurrency of programs plays a more vital role for programming such systems than ever before. We believe that a combination of a data-parallel approach with a declarative programming-style is up to that task: Data-parallel approaches are known to enable compilers to make efficient use of multi-processors without requiring low-level program annotations. Combining the data-parallel approach with a declarative programming-style guarantees semantic equivalence between sequential and concurrent executions of data parallel operations. Furthermore, the side-effect free setting and explicit model of dependencies enables compilers to maximise the size of the data-parallel program sections. However, the strength of the rigidity of the declarative approach also constitutes its weakness: Being bound to observe all data dependencies categorically rules out the use of side-effecting operations within data-parallel sections. Not only does this limit the size of these regions in certain situations, but it may also hamper an effective workload distribution. Considering side effects such as plotting individual pixels of an image or output for debugging purposes, there are situations where a non-deterministic order of side-effects would not be considered harmful at all. We propose a mechanism for enabling such non-determinism on the execution of side-effecting operations within data-parallel sections without sacrificing the side-effect free setting in general. Outside of the data-parallel sections we ensure single-threading of side-effecting operations using uniqueness typing. Within data-parallel operations however we allow the side-effecting operations of different threads to occur in any order, as long as effects of different threads are not interleaved. Furthermore, we still model the dependencies arising from the manipulated states within the data parallel sections. This measure preserves the explicitness of all data dependencies and therefore it preserves the transformational potential of any restructuring compiler.

2008

  • Efficient Heap Management for Declarative Data Parallel Programming on Multicores Clemens Grelck, Sven-Bodo Scholz (2008). In 3rd Workshop on Declarative Aspects of Multicore Programming (DAMP'08), San Francisco, CA, USA. pp. 17–31. ACM Press. 2008_1.pdf BibTexAbstract
    @InProceedings{GrelSchoDAMP08,
      author     = {Clemens Grelck and Sven-Bodo Scholz},
      title      = {Efficient Heap Management for Declarative Data Parallel Programming on Multicores},
      booktitle  = {3rd Workshop on Declarative Aspects of Multicore Programming (DAMP'08), San Francisco, CA, USA},
      year       = {2008},
      editor     = {Manuel Hermenegildo and Leaf Peterson and Neal Glew},
      pages      = {17--31},
      publisher  = {ACM Press},
      affil      = {ctca},
      pubaddress = {New York, NY, USA},
      topics     = {SAC},
      abstract   = {Declarative data parallel programming for shared memory multiprocessor systems implies paradigm-specific demands on the organisation of memory management. As a key feature of declarative programming implicit memory management is indispensable. Yet, the memory objects to be managed are very different from those that are predominant in general-purpose functional or object-oriented languages. Rather than complex structures of relatively small items interconnected by references, we are faced with large chunks of memory, usually arrays, which often account for 100s of MB each. Such sizes make relocation of data or failure to update arrays in-place prohibitively expensive. To address these challenges of the data parallel setting, the functional array language SaC employs continuous garbage collection via reference counting combined with several aggressive optimisation techniques. However, we have observed that overall memory performance does not only rely on efficient reference counting techniques, but to a similar extent on the underlying memory allocation strategy. As in the general memory management setting we can identify specific demands of the declarative data parallel setting on heap organisation. In this paper, we propose a heap manager design tailor-made to the needs of concurrent executions of declarative data parallel programs whose memory management is based on reference counting. We present runtime measurements that quantify the impact of the proposed design and relate it to the performance of several different general purpose heap managers that are available in the public domain.},
      category   = {par, core},
      URL        = {2008_1.pdf},
    }
    Declarative data parallel programming for shared memory multiprocessor systems implies paradigm-specific demands on the organisation of memory management. As a key feature of declarative programming implicit memory management is indispensable. Yet, the memory objects to be managed are very different from those that are predominant in general-purpose functional or object-oriented languages. Rather than complex structures of relatively small items interconnected by references, we are faced with large chunks of memory, usually arrays, which often account for 100s of MB each. Such sizes make relocation of data or failure to update arrays in-place prohibitively expensive. To address these challenges of the data parallel setting, the functional array language SaC employs continuous garbage collection via reference counting combined with several aggressive optimisation techniques. However, we have observed that overall memory performance does not only rely on efficient reference counting techniques, but to a similar extent on the underlying memory allocation strategy. As in the general memory management setting we can identify specific demands of the declarative data parallel setting on heap organisation. In this paper, we propose a heap manager design tailor-made to the needs of concurrent executions of declarative data parallel programs whose memory management is based on reference counting. We present runtime measurements that quantify the impact of the proposed design and relate it to the performance of several different general purpose heap managers that are available in the public domain.
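    A minimal sketch of the reference-counting side of such a design, in plain C. The descriptor layout and function names are invented for illustration; the actual SaC heap manager adds, among other things, thread-local allocation arenas and allocator-level optimisations.

    #include <stdlib.h>

    /* An array descriptor carrying a reference count. */
    typedef struct { int rc; size_t size; double *data; } array_t;

    static array_t *arr_alloc(size_t size)
    {
        array_t *a = malloc(sizeof *a);
        a->rc = 1;
        a->size = size;
        a->data = malloc(size * sizeof *a->data);
        return a;
    }

    static void arr_inc(array_t *a) { a->rc++; }

    /* Continuous (non-deferred) collection: the last owner frees. */
    static void arr_dec(array_t *a)
    {
        if (--a->rc == 0) { free(a->data); free(a); }
    }

    /* Consumes its argument; updates in place if we hold the only
       reference, which is the key optimisation refcounting enables. */
    static array_t *arr_scale(array_t *a, double s)
    {
        array_t *r = (a->rc == 1) ? a : arr_alloc(a->size);
        for (size_t i = 0; i < a->size; i++) r->data[i] = s * a->data[i];
        if (r != a) arr_dec(a);
        return r;
    }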
  • A Hybrid Shared Memory Execution Model for a Data Parallel Language with I/O Clemens Grelck, Steffen Kuthe, Sven-Bodo Scholz (2008). Parallel Processing Letters 18 (1) pp. 23–37. BibTexAbstract
    @Article{GrelKuthSchoPPL08,
      author   = {Clemens Grelck and Steffen Kuthe and Sven-Bodo Scholz},
      title    = {A Hybrid Shared Memory Execution Model for a Data Parallel Language with I/O},
      journal  = {Parallel Processing Letters},
      year     = {2008},
      volume   = {18},
      number   = {1},
      pages    = {23--37},
      abstract = {We propose a novel execution model for the implicitly parallel execution of data parallel programs in the presence of general I/O operations. This model is called hybrid because it combines the advantages of the standard execution models fork/join and SPMD. Based on program analysis the hybrid model adapts itself to one or the other on the granularity of individual instructions. We outline compilation techniques that systematically derive the organization of parallel code from data flow characteristics aiming at the reduction of execution mode switches in general and synchronization/communication requirements in particular. Experiments based on a prototype implementation show the effectiveness of the hybrid execution model for reducing parallel overhead.},
      affil    = {ctca},
      category = {par,core},
      topics   = {SAC},
    }
    We propose a novel execution model for the implicitly parallel execution of data parallel programs in the presence of general I/O operations. This model is called hybrid because it combines the advantages of the standard execution models fork/join and SPMD. Based on program analysis the hybrid model adapts itself to one or the other on the granularity of individual instructions. We outline compilation techniques that systematically derive the organization of parallel code from data flow characteristics aiming at the reduction of execution mode switches in general and synchronization/communication requirements in particular. Experiments based on a prototype implementation show the effectiveness of the hybrid execution model for reducing parallel overhead.
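    The difference between the two modes can be pictured with POSIX threads. Under fork/join, the worker threads would be joined before every sequential I/O operation; in the SPMD sketch below they stay alive and thread 0 performs the I/O between two barriers. This is a hand-rolled illustration, not the compiler-generated code.

    #include <pthread.h>
    #include <stdio.h>

    #define NTHREADS 4
    static pthread_barrier_t bar;

    static void *spmd(void *arg)
    {
        int tid = *(int *)arg;

        /* data-parallel phase 1: every thread works on its share ... */

        pthread_barrier_wait(&bar);
        if (tid == 0)                   /* sequential I/O, thread 0 only */
            puts("checkpoint written");
        pthread_barrier_wait(&bar);

        /* data-parallel phase 2 continues without re-forking threads ... */
        return NULL;
    }

    int main(void)
    {
        pthread_t t[NTHREADS];
        int ids[NTHREADS];
        pthread_barrier_init(&bar, NULL, NTHREADS);
        for (int i = 0; i < NTHREADS; i++) {
            ids[i] = i;
            pthread_create(&t[i], NULL, spmd, &ids[i]);
        }
        for (int i = 0; i < NTHREADS; i++) pthread_join(t[i], NULL);
        pthread_barrier_destroy(&bar);
        return 0;
    }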

2007

  • Coordinating Data Parallel Sac Programs with S-net Clemens Grelck, Sven-Bodo Scholz, Alex Shafarenko (2007). In 21st IEEE International Parallel and Distributed Processing Symposium (IPDPS'07), Long Beach, California, USA. IEEE Computer Society Press, Los Alamitos, California, USA. cdpspws.pdf BibTexAbstract
    @InProceedings{GrelSchoShafIPDPS07,
      author    = {Clemens Grelck and Sven-Bodo Scholz and Alex Shafarenko},
      title     = {Coordinating Data Parallel Sac Programs with S-net},
      booktitle = {21st IEEE International Parallel and Distributed Processing Symposium (IPDPS'07), Long Beach, California, USA},
      year      = {2007},
      publisher = {IEEE Computer Society Press, Los Alamitos, California, USA},
      abstract  = {We propose a two-layered approach for exploiting different forms of concurrency in complex systems: We specify computational components in our functional array language SaC, which exploits data parallel properties of array processing code. The declarative stream processing language S-Net is used to orchestrate the collaborative behaviour of these components in a streaming network. We illustrate our approach by a hybrid implementation of a sudoku puzzle solver as a representative for more complex search problems.},
      affil     = {ctca},
      category  = {par},
      topics    = {SAC,SNet},
      url       = {cdpspws.pdf},
    }
    We propose a two-layered approach for exploiting different forms of concurrency in complex systems: We specify computational components in our functional array language SaC, which exploits data parallel properties of array processing code. The declarative stream processing language S-Net is used to orchestrate the collaborative behaviour of these components in a streaming network. We illustrate our approach by a hybrid implementation of a sudoku puzzle solver as a representative for more complex search problems.
  • Sac: Off-the-shelf Support for Data-parallelism on Multicores Clemens Grelck, Sven-Bodo Scholz (2007). In 2nd Workshop on Declarative Aspects of Multicore Programming (DAMP'07), Nice, France. pp. 25–33. ACM Press. DOI damp07.pdf BibTexAbstract
    @InProceedings{GrelSchoDAMP07,
      author     = {Clemens Grelck and Sven-Bodo Scholz},
      title      = {Sac: Off-the-shelf Support for Data-parallelism on Multicores},
      booktitle  = {2nd Workshop on Declarative Aspects of Multicore Programming (DAMP'07), Nice, France},
      year       = {2007},
      editor     = {Neal Glew and Guy Blelloch},
      pages      = {25--33},
      publisher  = {ACM Press},
      abstract   = {The advent of multicore processors has raised new demand for harnessing concurrency in the software mass market. We summarise our previous work on the data parallel, functional array processing language SAC. Its compiler technology is geared towards highly runtime-efficient support for shared memory multiprocessors and, thus, is readily applicable to today’s off-the-shelf multicore systems. Following a brief introduction to the language itself, we identify the major compilation challenges and outline the solutions we have developed.},
      affil      = {ctca},
      category   = {par},
      contents   = {ISBN:978-1-59593-690-5},
      doi        = {10.1145/1248648.1248654},
      pubaddress = {New York, NY, USA},
      topics     = {SAC},
      url        = {damp07.pdf},
    }
    The advent of multicore processors has raised new demand for harnessing concurrency in the software mass market. We summarise our previous work on the data parallel, functional array processing language SAC. Its compiler technology is geared towards highly runtime-efficient support for shared memory multiprocessors and, thus, is readily applicable to today’s off-the-shelf multicore systems. Following a brief introduction to the language itself, we identify the major compilation challenges and outline the solutions we have developed.

2005

  • Shared Memory Multiprocessor Support for Functional Array Processing in Sac Clemens Grelck (2005). Journal of Functional Programming 15 (3) pp. 353–401. DOI smmsffapis.pdf BibTexAbstract
    @Article{GrelckJFP05,
      author   = {Clemens Grelck},
      title    = {Shared Memory Multiprocessor Support for Functional Array Processing in Sac},
      journal  = {Journal of Functional Programming},
      year     = {2005},
      volume   = {15},
      number   = {3},
      pages    = {353--401},
      abstract = {Classical application domains of parallel computing are dominated by processing large arrays of numerical data. Whereas most functional languages focus on lists and trees rather than on arrays, SAC is tailor-made in design and in implementation for efficient high-level array processing. Advanced compiler optimizations yield performance levels that are often competitive with low-level imperative implementations. Based on SAC, we develop compilation techniques and runtime system support for the compiler-directed parallel execution of high-level functional array processing code on shared memory architectures. Competitive sequential performance gives us the opportunity to exploit the conceptual advantages of the functional paradigm for achieving real performance gains with respect to existing imperative implementations, not only in comparison with uniprocessor runtimes. While the design of SAC facilitates parallelization, the particular challenge of high sequential performance is that realization of satisfying speedups through parallelization becomes substantially more difficult. We present an initial compilation scheme and multi-threaded execution model, which we step-wise refine to reduce organizational overhead and to improve parallel performance. We close with a detailed analysis of the impact of certain design decisions on runtime performance, based on a series of experiments.},
      category = {design, par},
      contents = {Canonical reference for SAC multithreading},
      doi      = {10.1017/S0956796805005538},
      topics   = {SAC},
      url      = {smmsffapis.pdf},
    }
    Classical application domains of parallel computing are dominated by processing large arrays of numerical data. Whereas most functional languages focus on lists and trees rather than on arrays, SAC is tailor-made in design and in implementation for efficient high-level array processing. Advanced compiler optimizations yield performance levels that are often competitive with low-level imperative implementations. Based on SAC, we develop compilation techniques and runtime system support for the compiler-directed parallel execution of high-level functional array processing code on shared memory architectures. Competitive sequential performance gives us the opportunity to exploit the conceptual advantages of the functional paradigm for achieving real performance gains with respect to existing imperative implementations, not only in comparison with uniprocessor runtimes. While the design of SAC facilitates parallelization, the particular challenge of high sequential performance is that realization of satisfying speedups through parallelization becomes substantially more difficult. We present an initial compilation scheme and multi-threaded execution model, which we step-wise refine to reduce organizational overhead and to improve parallel performance. We close with a detailed analysis of the impact of certain design decisions on runtime performance, based on a series of experiments.

2003

  • A Multithreaded Compiler Backend for High-level Array Programming Clemens Grelck (2003). In 2nd International Conference on Parallel and Distributed Computing and Networks (PDCN'03), Innsbruck, Austria. pp. 478–484. ACTA Press. DOI amcbfhlap.pdf BibTexAbstract
    @InProceedings{GrelckPDCN03,
      author     = {Clemens Grelck},
      title      = {A Multithreaded Compiler Backend for High-level Array Programming},
      booktitle  = {2nd International Conference on Parallel and Distributed Computing and Networks (PDCN'03), Innsbruck, Austria},
      year       = {2003},
      editor     = {Mohammed H. Hamza},
      pages      = {478--484},
      publisher  = {ACTA Press},
      abstract   = {Whenever large homogeneous data structures need to be processed in a non-trivial way, e.g. in computational sciences, image processing, or system simulation, high-level array programming in the style of APL offers a far more concise and abstract approach than traditional scalar languages such as C/C++ or FORTRAN-77. The same sort of applications often can also be characterized as performance critical and today represents the major domain for parallel processing. This paper reports on the development of a compiler backend which allows to implicitly generate multithreaded code from high-level array program specifications. On shared memory multiprocessor systems, this code can be executed in parallel without any additional programming effort. After sketching out basic compilation schemes, optimizations on the runtime system are addressed and, finally, experimental runtime figures are presented.},
      category   = {core, par, opt},
      contents   = {[ISBN 0-88986-341-5],[ISSN 1027-2666]},
      doi        = {10.1.1.408.8553},
      pubaddress = {Anaheim, CA, USA},
      sourceurl  = {http://www.isp.mu-luebeck.de/~grelck/publications/mt-backend-innsbruck-03.ps.gz},
      topics     = {SAC,Shared Memory},
      url        = {AMCBFHLAP.pdf},
    }
    Whenever large homogeneous data structures need to be processed in a non-trivial way, e.g. in computational sciences, image processing, or system simulation, high-level array programming in the style of APL offers a far more concise and abstract approach than traditional scalar languages such as C/C++ or FORTRAN-77. The same sort of applications often can also be characterized as performance critical and today represents the major domain for parallel processing. This paper reports on the development of a compiler backend which allows to implicitly generate multithreaded code from high-level array program specifications. On shared memory multiprocessor systems, this code can be executed in parallel without any additional programming effort. After sketching out basic compilation schemes, optimizations on the runtime system are addressed and, finally, experimental runtime figures are presented.

2000

  • Hpf Vs. Sac — a Case Study Clemens Grelck, Sven-Bodo Scholz (2000). In Euro-Par 2000 Parallel Processing, 6th International Euro-Par Conference (Euro-Par'00), Munich, Germany. pp. 620–624. Springer. DOI hpf-vs-sac-muenchen-00.pdf BibTexAbstract
    @InProceedings{GrelSchoEUROPAR00,
      author     = {Clemens Grelck and Sven-Bodo Scholz},
      title      = {Hpf Vs. Sac --- a Case Study},
      booktitle  = {Euro-Par 2000 Parallel Processing, 6th International Euro-Par Conference (Euro-Par'00), Munich, Germany},
      year       = {2000},
      editor     = {Arndt Bode and Thomas Ludwig and Wolfgang Karl and Roland Wismüller},
      volume     = {1900},
      series     = {Lecture Notes in Computer Science},
      pages      = {620--624},
      publisher  = {Springer},
      abstract   = {This paper compares the functional programming language Sac to Hpf with respect to specificational elegance and runtime performance. A well-known benchmark, red-black SOR, serves as a case study. After presenting the Hpf reference implementation, alternative Sac implementations are discussed. Eventually, performance figures show the ability to compile highly generic Sac specifications into machine code that outperforms the Hpf implementation on a shared memory multi-processor by a factor of about 3.},
      category   = {core,apps,par},
      doi        = {10.1007/3-540-44520-X_87},
      pubaddress = {Berlin, Heidelberg, Germany},
      summary    = {This paper compares SAC to HPF in terms of expressiveness and execution runtimes. Red/black successive over-relaxation serves as a case study. Several SAC specifications are presented and compared with a HPF reference implementation. Performance figures for up to 10 processors of a shared memory multiprocessor are presented.},
      topics     = {SAC,HPF},
      url        = {hpf-vs-sac-muenchen-00.pdf},
    }
    This paper compares the functional programming language Sac to Hpf with respect to specificational elegance and runtime performance. A well-known benchmark, red-black SOR, serves as a case study. After presenting the Hpf reference implementation, alternative Sac implementations are discussed. Eventually, performance figures show the ability to compile highly generic Sac specifications into machine code that outperforms the Hpf implementation on a shared memory multi-processor by a factor of about 3.
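    The benchmark is compact enough to sketch in C. Below is one red/black SOR iteration on an n-by-n grid in the usual textbook formulation; the Sac and Hpf versions in the paper express the same computation at a higher level of abstraction.

    /* Points are coloured like a checkerboard, so all points of one
       colour can be updated in parallel without data races. */
    static void rb_sor_sweep(int n, double u[n][n], double omega, int colour)
    {
        for (int i = 1; i < n - 1; i++)
            for (int j = 1; j < n - 1; j++)
                if ((i + j) % 2 == colour)
                    u[i][j] += omega * 0.25 *
                        (u[i-1][j] + u[i+1][j] + u[i][j-1] + u[i][j+1]
                         - 4.0 * u[i][j]);
    }

    /* One SOR iteration: sweep the red points, then the black points. */
    static void rb_sor_iter(int n, double u[n][n], double omega)
    {
        rb_sor_sweep(n, u, omega, 0);    /* red   */
        rb_sor_sweep(n, u, omega, 1);    /* black */
    }

    int main(void)
    {
        enum { n = 64 };
        static double u[n][n];           /* zero boundary, zero interior */
        u[n/2][n/2] = 1.0;
        for (int it = 0; it < 100; it++)
            rb_sor_iter(n, u, 1.5);
        return 0;
    }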
  • Array Padding in the Functional Language Sac Clemens Grelck (2000). In International Conference on Parallel and Distributed Processing Techniques and Applications (PDPTA'00), Las Vegas, Nevada, USA. pp. 2553–2560. CSREA Press, Athens, Georgia, USA. array-padding-lasvegas-00.pdf BibTexAbstract
    @InProceedings{GrelckPDPTA00,
      author    = {Clemens Grelck},
      title     = {Array Padding in the Functional Language Sac},
      booktitle = {International Conference on Parallel and Distributed Processing Techniques and Applications (PDPTA'00), Las Vegas, Nevada, USA},
      year      = {2000},
      editor    = {Hamid R. Arabnia},
      volume    = {5},
      pages     = {2553--2560},
      publisher = {CSREA Press, Athens, Georgia, USA},
      abstract  = {Sac is a functional array processing language that tries to combine generic, high-level program specifications with efficient runtime behavior. Being particularly designed for numerical applications, runtime performance critically depends on the effective utilization of the memory hierarchy. For many programs, however, it can be observed that the achieved performance significantly changes with small variations in the problem size. Array padding is a well-known optimization technique that adjusts the data layout of arrays in order to make better usage of caches. The paper presents an algorithm that derives a customized data layout from an array access pattern and a cache specification. Cache phenomena such as spatial and temporal reuse are taken into account as well as different cache architectures. The effectiveness is demonstrated by investigations on the runtime performance of the PDE1 benchmark on a shared memory multiprocessor.},
      category  = {core,opt,par},
      summary   = {This paper describes an inference strategy to identify suitable pad sizes for intra-array padding in order to eliminate self interference cache conflicts in numerical algorithms. Its performance impact in particular with respect to multithreaded program execution is shown by means of runtime figures.},
      topics    = {SAC,Array Data Layout},
      url       = {array-padding-lasvegas-00.pdf},
    }
    Sac is a functional array processing language that tries to combine generic, high-level program specifications with efficient runtime behavior. Being particularly designed for numerical applications, runtime performance critically depends on the effective utilization of the memory hierarchy. For many programs, however, it can be observed that the achieved performance significantly changes with small variations in the problem size. Array padding is a well-known optimization technique that adjusts the data layout of arrays in order to make better usage of caches. The paper presents an algorithm that derives a customized data layout from an array access pattern and a cache specification. Cache phenomena such as spatial and temporal reuse are taken into account as well as different cache architectures. The effectiveness is demonstrated by investigations on the runtime performance of the PDE1 benchmark on a shared memory multiprocessor.
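    The phenomenon the algorithm addresses is easy to reproduce in C. Below is a sketch of intra-array padding with illustrative constants; the paper infers suitable pad sizes automatically from the access pattern and a cache specification.

    #include <stdlib.h>

    #define N      1024          /* logical row length in doubles     */
    #define PAD    8             /* pad elements appended to each row */
    #define STRIDE (N + PAD)     /* physical row length after padding */

    /* With PAD = 0, vertically adjacent elements are N * sizeof(double)
       bytes apart; whenever that distance is a multiple of a direct-mapped
       cache's size, u[i-1][j], u[i][j] and u[i+1][j] map to the same cache
       set and evict each other (self-interference). The extra PAD elements
       shift each row's mapping and break the conflict. */
    static inline double *elem(double *a, int i, int j)
    {
        return &a[(size_t)i * STRIDE + j];   /* index with padded stride */
    }

    int main(void)
    {
        double *a = calloc((size_t)N * STRIDE, sizeof *a);
        double s = 0.0;
        for (int i = 1; i < N - 1; i++)      /* stencil-like traversal */
            for (int j = 1; j < N - 1; j++)
                s += *elem(a, i - 1, j) + *elem(a, i + 1, j);
        free(a);
        return (int)s;
    }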

1999

  • Shared Memory Multiprocessor Support for Sac Clemens Grelck (1999). In Implementation of Functional Languages, 10th International Workshop (IFL'98), London, England, UK, Selected Papers. pp. 38–54. Springer. DOI mt-support-london-98.pdf BibTexAbstract
    @InProceedings{GrelckIFL98,
      author     = {Clemens Grelck},
      title      = {Shared Memory Multiprocessor Support for Sac},
      booktitle  = {Implementation of Functional Languages, 10th International Workshop (IFL'98), London, England, UK, Selected Papers},
      year       = {1999},
      editor     = {Kevin Hammond and Tony Davie and Chris Clack},
      volume     = {1595},
      series     = {Lecture Notes in Computer Science},
      pages      = {38--54},
      publisher  = {Springer},
      abstract   = {Sac (Single Assignment C) is a strict, purely functional programming language primarily designed with numerical applications in mind. Particular emphasis is on efficient support for arrays both in terms of language expressiveness and in terms of runtime performance. Array operations in Sac are based on element-wise specifications using so-called With-loops. These language constructs are also well-suited for concurrent execution on multiprocessor systems. This paper outlines an implicit approach to compile Sac programs for multi-threaded execution on shared memory architectures. Besides the basic compilation scheme, a brief overview of the runtime system is given. Finally, preliminary performance figures demonstrate that this approach is well-suited to achieve almost linear speedups.},
      category   = {core,design,par,opt},
      doi        = {10.1007/3-540-48515-5_3},
      pubaddress = {Berlin, Heidelberg, Germany},
      summary    = {This paper introduces the basic concepts of multi-threaded program execution on shared memory multiprocessor systems. The described approach is completely implicit, i.e. the compiler and the runtime system handle everything from granularity to load balancing on their own. The paper also contains preliminary performance evaluations.},
      topics     = {SAC,Shared Memory},
      url        = {mt-support-london-98.pdf},
    }
    Sac (Single Assignment C) is a strict, purely functional programming language primarily designed with numerical applications in mind. Particular emphasis is on efficient support for arrays both in terms of language expressiveness and in terms of runtime performance. Array operations in Sac are based on element-wise specifications using so-called With-loops. These language constructs are also well-suited for concurrent execution on multiprocessor systems. This paper outlines an implicit approach to compile Sac programs for multi-threaded execution on shared memory architectures. Besides the basic compilation scheme, a brief overview of the runtime system is given. Finally, preliminary performance figures demonstrate that this approach is well-suited to achieve almost linear speedups.
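    The essence of the compilation scheme can be pictured in C: the index space of a With-loop is split into contiguous chunks that are processed by separate threads. Chunking and scheduling are hand-rolled here for illustration; the real runtime system decides granularity and load balancing itself.

    #include <pthread.h>
    #include <math.h>

    #define N        1000000
    #define NTHREADS 4

    static double res[N];

    typedef struct { int lo, hi; } range_t;

    /* One chunk of the element-wise definition res[i] = f(i). */
    static void *chunk(void *arg)
    {
        range_t *r = arg;
        for (int i = r->lo; i < r->hi; i++)
            res[i] = sqrt((double)i);
        return NULL;
    }

    int main(void)
    {
        pthread_t t[NTHREADS];
        range_t rg[NTHREADS];
        for (int i = 0; i < NTHREADS; i++) {
            rg[i].lo = i * (N / NTHREADS);
            rg[i].hi = (i + 1 == NTHREADS) ? N : (i + 1) * (N / NTHREADS);
            pthread_create(&t[i], NULL, chunk, &rg[i]);
        }
        for (int i = 0; i < NTHREADS; i++) pthread_join(t[i], NULL);
        return 0;
    }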

2004

  • Towards Fully Controlled Overloading across Module Boundaries S. Herhut, S.-B. Scholz (2004). In 16th International Workshop on the Implementation and Application of Functional Languages (IFL'04), Lübeck, Germany. pp. 395–408. University of Kiel. DOI tfcoamb.pdf BibTexAbstract
    @InProceedings{HerhSchoIFL04,
      author    = {S. Herhut and S.-B. Scholz},
      title     = {Towards Fully Controlled Overloading across Module Boundaries},
      booktitle = {16th International Workshop on the Implementation and Application of Functional Languages (IFL'04), L\"ubeck, Germany},
      year      = {2004},
      editor    = {C. Grelck and F. Huch},
      pages     = {395--408},
      publisher = {University of Kiel},
      abstract  = {This paper proposes a set of modularisation constructs as well as a new implementation technique for overloading functions across module boundaries. In contrast to existing approaches, it allows to fully preserve separation of namespaces without restricting the potential for later overloadings, including mutual recursion across module boundaries. This gives the programmer control over the visibility of individual function definitions within each namespace and, thus, allows several different sets of instances for a single overloaded function to be used within one program. Based on a very simple applied λ-calculus as core language, the modularisation constructs are defined and a transformation scheme into an applied λ-calculus is presented. Furthermore, an outline of an implementation in the context of the functional programming language SaC is given.},
      affil     = {ctca},
      category  = {design, modules},
      doi       = {10.1.1.408.4298},
      topics    = {SAC,on SAC},
      url       = {TFCOAMB.pdf},
    }
    This paper proposes a set of modularisation constructs as well as a new implementation technique for overloading functions across module boundaries. In contrast to existing approaches, it allows to fully preserve separation of namespaces without restricting the potential for later overloadings, including mutual recursion across module boundaries. This gives the programmer control over the visibility of individual function definitions within each namespace and, thus, allows several different sets of instances for a single overloaded function to be used within one program. Based on a very simple applied λ-calculus as core language, the modularisation constructs are defined and a transformation scheme into an applied λ-calculus is presented. Furthermore, an outline of an implementation in the context of the functional programming language SaC is given.

2000

  • On Interfacing Sac Modules with C Programs Nico Marcussen-Wulff, Sven-Bodo Scholz (2000). In 12th International Workshop on Implementation of Functional Languages (IFL'00), Aachen, Germany. pp. 381–386. Technical University of Aachen. c-interface-aachen-00.pdf BibTexAbstract
    @InProceedings{MarcSchoIFL00draft,
      author    = {Nico Marcussen-Wulff and Sven-Bodo Scholz},
      title     = {On Interfacing Sac Modules with C Programs},
      booktitle = {12th International Workshop on Implementation of Functional Languages (IFL'00), Aachen, Germany},
      year      = {2000},
      editor    = {Markus Mohnen and Pieter Koopman},
      volume    = {AIB-00-7},
      series    = {Aachener Informatik-Berichte},
      pages     = {381--386},
      publisher = {Technical University of Aachen},
      abstract  = {This paper is on interfacing C programs with modules written in Sac, a functional language with powerful array processing facilities. Interfacing program modules written in different languages has become an increasingly important issue of economic software design. It allows to re-use as much existing code (program libraries) as possible and to import special language features not available in the language in which the main program is written.},
      category  = {ffi,modules},
      summary   = {This draft paper describes an interface which allows to use SAC modules from C programs.},
      topics    = {SAC,Language Interfacing},
      url       = {c-interface-aachen-00.pdf},
    }
    This paper is on interfacing C programs with modules written in Sac, a functional language with powerful array processing facilities. Interfacing program modules written in different languages has become an increasingly important issue of economic software design. It allows to re-use as much existing code (program libraries) as possible and to import special language features not available in the language in which the main program is written.
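    From the C side, using such an interface amounts to including a generated header and calling wrapper functions. All names below (sacmod.h, mod_sum) are hypothetical, invented purely for illustration; the concrete generated interface is described in the paper.

    /* Hypothetical example of a C program calling a SaC module; none of
       these identifiers are the real generated names. */
    #include <stdio.h>
    #include "sacmod.h"              /* hypothetical generated header */

    int main(void)
    {
        double data[4] = {1.0, 2.0, 3.0, 4.0};
        /* hypothetical wrapper: marshals the C array into a SaC array,
           runs the SaC function, and returns the scalar result */
        double s = mod_sum(data, 4);
        printf("sum = %f\n", s);
        return 0;
    }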

1996

  • Integration Eines Modul- Und Klassen-konzeptes in Die Funktionale Programmiersprache Sac – Single Assignment C (Integration of a Module and Class Concept into the Functional Programming Language SaC – Single Assignment C) Clemens Grelck (1996). Kiel, Germany. sac2c-modules-classes-dipl-cg-96.pdf BibTex
    @MastersThesis{Grelck96,
      author   = {Clemens Grelck},
      title    = {Integration Eines Modul- Und Klassen-konzeptes in Die Funktionale Programmiersprache Sac -- Single Assignment C},
      school   = {Institut für Informatik und Praktische Mathematik, Christian-Albrechts-Universität},
      year     = {1996},
      address  = {Kiel, Germany},
      category = {design,core,modules},
      summary  = {Clemens' Diploma thesis is a detailed description of the module system, the integration of states and state modifications including the realization of powerful I/O facilities, and the foreign language interface which allows to use functions implemented in C within SAC programs. Both language design and implementation issues are covered.},
      topics   = {SAC,Uniqueness Types,Language Interfacing},
      url      = {sac2c-modules-classes-dipl-cg-96.pdf},
    }

2015

  • Making Fortran Legacy Code More Functional: Using the BGS Geomagnetic Field Modelling System As an Example Hans-Nikolai Viessmann, Sven-Bodo Scholz, Artjoms Sinkarovs, Brian Bainbridge, Brian Hamilton, Simon Flower (2015). In Proceedings of the 27th Symposium on the Implementation and Application of Functional Programming Languages. pp. 11:1–11:13. ACM. New York, NY, USA. DOI BibTexAbstract
    @InProceedings{VSSIFL2015,
     author = {Viessmann, Hans-Nikolai and Scholz, Sven-Bodo and Sinkarovs, Artjoms and Bainbridge, Brian and Hamilton, Brian and Flower, Simon},
     title = {Making Fortran Legacy Code More Functional: Using the BGS Geomagnetic Field Modelling System As an Example},
     booktitle = {Proceedings of the 27th Symposium on the Implementation and Application of Functional Programming Languages},
     series = {IFL '15},
     year = {2015},
     isbn = {978-1-4503-4273-5},
     location = {Koblenz, Germany},
     pages = {11:1--11:13},
     articleno = {11},
     numpages = {13},
     doi = {10.1145/2897336.2897348},
     acmid = {2897348},
     publisher = {ACM},
     address = {New York, NY, USA},
     keywords = {eigensystem, foreign function interface, fortran, functional programming, high-performance computing, single-assignment C},
     summary = {This paper presents the support for calling SaC from Fortran and it reports about the experiences in the context of the BGS Geomagnetic Field Modelling System code.},
 abstract  = {This paper presents an application case study of the British Geological Survey's (BGS) Geomagnetic Field Modelling System code. The program consists of roughly 20 000 lines of highly-tuned Fortran MPI code that has a runtime of about 12 hours for a single execution cycle on a cluster utilising approximately 100 CPU cores. The program contains a sequential bottleneck that executes on a single node of the cluster and takes up to 50% of the overall runtime. We describe an experiment in which we rewrote the bottleneck Fortran code in SaC, to make use of auto-parallelisation provided by the SaC compiler. The paper also presents an implementation of a foreign-function interface, to link the SaC kernel with the Fortran application. Our initial performance measurements compare the SaC kernel performance with the Fortran bottleneck code; we also present results using an OpenMP Fortran implementation. Our figures show that the SaC-based implementation achieves roughly a 12.5% runtime improvement, and outperforms the OpenMP implementation.},
     category = {core,ffi,apps,par},
    }
    This paper presents an application case study of the British Geological Survey's (BGS) Geomagnetic Field Modelling System code. The program consists of roughly 20 000 lines of highly-tuned Fortran MPI code that has a runtime of about 12 hours for a single execution cycle on a cluster utilising approximately 100 CPU cores. The program contains a sequential bottleneck that executes on a single node of the cluster and takes up to 50% of the overall runtime. We describe an experiment in which we rewrote the bottleneck Fortran code in SaC, to make use of auto-parallelisation provided by the SaC compiler. The paper also presents an implementation of a foreign-function interface, to link the SaC kernel with the Fortran application. Our initial performance measurements compare the SaC kernel performance with the Fortran bottleneck code; we also present results using an OpenMP Fortran implementation. Our figures show that the SaC-based implementation achieves roughly a 12.5% runtime improvement, and outperforms the OpenMP implementation.

2000

  • On Interfacing Sac Modules with C Programs Nico Marcussen-Wulff, Sven-Bodo Scholz (2000). In 12th International Workshop on Implementation of Functional Languages (IFL'00), Aachen, Germany. pp. 381–386. Technical University of Aachen. c-interface-aachen-00.pdf BibTexAbstract
    @InProceedings{MarcSchoIFL00draft,
      author    = {Nico Marcussen-Wulff and Sven-Bodo Scholz},
      title     = {On Interfacing Sac Modules with C Programs},
      booktitle = {12th International Workshop on Implementation of Functional Languages (IFL'00), Aachen, Germany},
      year      = {2000},
      editor    = {Markus Mohnen and Pieter Koopman},
      volume    = {AIB-00-7},
      series    = {Aachener Informatik-Berichte},
      pages     = {381--386},
      publisher = {Technical University of Aachen},
      abstract  = {This paper is on interfacing C programs with modules written in Sac, a functional language with powerful array processing facilities. Interfacing program modules written in different languages has become an increasingly important issue of economic software design. It allows to re-use as much existing code (program libraries) as possible and to import special language features not available in the language in which the main program is written.},
      category  = {ffi,modules},
      summary   = {This draft paper describes an interface which allows to use SAC modules from C programs.},
      topics    = {SAC,Language Interfacing},
      url       = {c-interface-aachen-00.pdf},
    }
    This paper is on interfacing C programs with modules written in Sac, a functional language with powerful array processing facilities. Interfacing program modules written in different languages has become an increasingly important issue of economic software design. It allows to re-use as much existing code (program libraries) as possible and to import special language features not available in the language in which the main program is written.

2023

  • Rank-Polymorphism for Shape-Guided Blocking Artjoms Šinkarovs, Thomas Koopman, Sven-Bodo Scholz (2023). In Proceedings of the 11th ACM SIGPLAN International Workshop on Functional High-Performance and Numerical Computing. pp. 1–14. Association for Computing Machinery. New York, NY, USA. DOI BibTexAbstract
    @inproceedings{ShinKoopSchoFHPNC23,
     author = {\v{S}inkarovs, Artjoms and Koopman, Thomas and Scholz, Sven-Bodo},
     title = {Rank-Polymorphism for Shape-Guided Blocking},
     year = {2023},
     isbn = {9798400702969},
     publisher = {Association for Computing Machinery},
     address = {New York, NY, USA},
     artefact = {https://dl.acm.org/do/10.1145/3580402/},
     doi = {10.1145/3609024.3609410},
     abstract = {Many numerical algorithms on matrices or tensors can be formulated in a blocking style which improves performance due to better cache locality. In imperative languages, blocking is achieved by introducing additional layers of loops in a nested fashion alongside with suitable adjustments in index computations. While this process is tedious and error-prone, it is also difficult to implement a generically blocked version that would support arbitrary levels of blocking. At the example of matrix multiply, this paper demonstrates how rank-polymorphic array languages enable the specification of such generically blocked algorithms in a simple, recursive form. The depth of the blocking as well as blocking factors can be encoded in the structure of array shapes. In turn, reshaping arrays makes it possible to switch between blocked and non-blocked arrays. Through rank-polymorphic array combinators, any specification of loop boundaries or explicit index computations can be avoided. Firstly, we propose a dependently-typed framework for rank-polymorphic arrays. We use it to demonstrate that all blocked algorithms can be naturally derived by induction on the argument shapes. Our framework guarantees lack of out-of-bound indexing, and we also prove that all the blocked versions compute the same results as the canonical algorithm. Secondly, we translate our specification to the array language SaC. Not only do we show that we achieve similar conciseness in the implementation, but we also observe good performance of the generated code. We achieve a 7\% improvement compared to the highly-optimised OpenBLAS library, and 3\% compared to Intel’s MKL library when running on a 32-core shared-memory system.},
     booktitle = {Proceedings of the 11th ACM SIGPLAN International Workshop on Functional High-Performance and Numerical Computing},
     pages = {1--14},
     numpages = {14},
     keywords = {Matrix Multiply, Dependent Types, Array Programming, HPC, Rank-Polymorphism},
     location = {Seattle, WA, USA},
     category = {par,apps},
     series = {FHPNC 2023},
    }
    Many numerical algorithms on matrices or tensors can be formulated in a blocking style which improves performance due to better cache locality. In imperative languages, blocking is achieved by introducing additional layers of loops in a nested fashion alongside with suitable adjustments in index computations. While this process is tedious and error-prone, it is also difficult to implement a generically blocked version that would support arbitrary levels of blocking. At the example of matrix multiply, this paper demonstrates how rank-polymorphic array languages enable the specification of such generically blocked algorithms in a simple, recursive form. The depth of the blocking as well as blocking factors can be encoded in the structure of array shapes. In turn, reshaping arrays makes it possible to switch between blocked and non-blocked arrays. Through rank-polymorphic array combinators, any specification of loop boundaries or explicit index computations can be avoided. Firstly, we propose a dependently-typed framework for rank-polymorphic arrays. We use it to demonstrate that all blocked algorithms can be naturally derived by induction on the argument shapes. Our framework guarantees lack of out-of-bound indexing, and we also prove that all the blocked versions compute the same results as the canonical algorithm. Secondly, we translate our specification to the array language SaC. Not only do we show that we achieve similar conciseness in the implementation, but we also observe good performance of the generated code. We achieve a 7% improvement compared to the highly-optimised OpenBLAS library, and 3% compared to Intel’s MKL library when running on a 32-core shared-memory system.

2022

  • Parallel Scan as a Multidimensional Array Problem Artjoms Šinkarovs, Sven-Bodo Scholz (2022). In Proceedings of the 8th ACM SIGPLAN International Workshop on Libraries, Languages and Compilers for Array Programming. pp. 1–11. Association for Computing Machinery. New York, NY, USA. DOI sinkarovsscholzarray22.pdf BibTex Abstract
    @inproceedings{SinkSchoARRAY22,
     author = {\v{S}inkarovs, Artjoms and Scholz, Sven-Bodo},
     title = {Parallel Scan as a Multidimensional Array Problem},
     year = {2022},
     isbn = {9781450392693},
     publisher = {Association for Computing Machinery},
     address = {New York, NY, USA},
     url = {SinkarovsScholzARRAY22.pdf},
     doi = {10.1145/3520306.3534500},
     summary = {The true power of rank-polymorphism, demonstrated by the example of scan.},
     category = {apps, par},
     abstract = {For many algorithms, it is challenging to identify a suitable parallel version, as the design space is typically very large. In this paper we demonstrate how rank-polymorphic array languages can be used as a tool to explore such design spaces through concise high-level specifications. If input data can be organised into a multi-dimensional array, and the algorithm can be stated as a recursive traversal over sub-arrays, array languages offer a lot of expressive power. The reason for this is that array shapes can be used to guide recursive traversals. Conciseness of specifications comes from array reshapes that move the desired elements into canonical hyperplanes. As a case study, we discuss several variants of implementing prefix sums (also known as scans) in SaC. We demonstrate how small code adjustments suffice to change the concurrency pattern exposed to the compiler. It turns out that variability that is typically achieved by generic inductive data types such as binary trees is a special case of what is offered by the array paradigm.},
     booktitle = {Proceedings of the 8th ACM SIGPLAN International Workshop on Libraries, Languages and Compilers for Array Programming},
     pages = {1--11},
     numpages = {11},
     keywords = {rank polymorphism, array languages, algorithms, parallelism, functional programming, prefix sum},
     location = {San Diego, CA, USA},
     series = {ARRAY 2022}
    }
    For many algorithms, it is challenging to identify a suitable parallel version, as the design space is typically very large. In this paper we demonstrate how rank-polymorphic array languages can be used as a tool to explore such design spaces through concise high-level specifications. If input data can be organised into a multi-dimensional array, and the algorithm can be stated as a recursive traversal over sub-arrays, array languages offer a lot of expressive power. The reason for this is that array shapes can be used to guide recursive traversals. Conciseness of specifications comes from array reshapes that move the desired elements into canonical hyperplanes. As a case study, we discuss several variants of implementing prefix sums (also known as scans) in SaC. We demonstrate how small code adjustments suffice to change the concurrency pattern exposed to the compiler. It turns out that variability that is typically achieved by generic inductive data types such as binary trees is a special case of what is offered by the array paradigm.
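    The reshape-driven scan variants the paper explores can be mimicked in NumPy: reshape the input vector into rows, scan each row independently (the parallel part), then fix the rows up with an exclusive scan of the row totals. This is our illustration of the idea, not the paper's SaC code; blocked_scan and the assumption that the row count divides the length are ours.

      import numpy as np

      # Prefix sum via reshaping: rows can be scanned independently,
      # then per-row offsets are added in a second step.
      def blocked_scan(x, rows):
          n = x.size
          assert n % rows == 0                      # sketch assumes rows divides n
          a = x.reshape(rows, n // rows)
          s = np.cumsum(a, axis=1)                  # scan each row independently
          offs = np.concatenate(([0], np.cumsum(s[:-1, -1])))  # exclusive scan of row totals
          return (s + offs[:, None]).reshape(n)

      x = np.arange(1, 13)
      assert np.array_equal(blocked_scan(x, 3), np.cumsum(x))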

2021

  • Array Languages Make Neural Networks Fast Artjoms Šinkarovs, Hans Viessmann, Sven-Bodo Scholz (2021). In Proceedings of the 6th ACM SIGPLAN International Workshop on Libraries, Languages and Compilers for Array Programming. ACM. New York, NY, USA. DOI cnn-in-sac-array21.pdf BibTex Abstract
    @inproceedings{SinkViesSchoARRAY21,
     author = {Artjoms Šinkarovs and Hans Viessmann and Sven-Bodo Scholz},
     title = {Array Languages Make Neural Networks Fast},
     year = {2021},
     isbn = {9781450384667},
     publisher = {ACM},
     address = {New York, NY, USA},
     url = {cnn-in-sac-ARRAY21.pdf},
     doi = {10.1145/3315454.3464312},
     summary = {Demonstrates how SaC combines high productivity with high performance: less than 100 lines for a CNN from scratch with competitive performance.},
     abstract = {Most implementations of machine learning algorithms are based on special-purpose frameworks such as TensorFlow or PyTorch. While these frameworks are convenient to use, they introduce a multi-million-line code dependency that one has to trust, understand and potentially modify. As an alternative, this paper investigates a direct implementation of a state-of-the-art Convolutional Neural Network (CNN) in an array language. While our implementation requires 150 lines of code to define the special-purpose operators needed for CNNs, which are readily provided through frameworks such as TensorFlow and PyTorch, it outperforms these frameworks by factors of 2 and 3, for a simple example network, on a fixed set of hardware: a 64-core GPU-accelerated machine. The resulting specification is written in a rank-polymorphic data-parallel style, and it can be immediately leveraged by optimising compilers. Indeed, array languages make neural networks fast.},
     booktitle = {Proceedings of the 6th ACM SIGPLAN International Workshop on Libraries, Languages and Compilers for Array Programming},
     numpages = {12},
     keywords = {machine learning,array languages},
     category = {apps},
     location = {Virtual,Canada},
     series = {ARRAY 2021}
    }
    Most implementations of machine learning algorithms are based on special-purpose frameworks such as TensorFlow or PyTorch. While these frameworks are convenient to use, they introduce a multi-million-line code dependency that one has to trust, understand and potentially modify. As an alternative, this paper investigates a direct implementation of a state-of-the-art Convolutional Neural Network (CNN) in an array language. While our implementation requires 150 lines of code to define the special-purpose operators needed for CNNs, which are readily provided through frameworks such as TensorFlow and PyTorch, it outperforms these frameworks by factors of 2 and 3, for a simple example network, on a fixed set of hardware: a 64-core GPU-accelerated machine. The resulting specification is written in a rank-polymorphic data-parallel style, and it can be immediately leveraged by optimising compilers. Indeed, array languages make neural networks fast.
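    As a flavour of the rank-polymorphic data-parallel style, here is a minimal convolution operator in NumPy (sliding_window_view requires NumPy 1.20 or later). It is our sketch of the general approach, not one of the paper's SaC operators; conv2d and its argument conventions are ours.

      import numpy as np
      from numpy.lib.stride_tricks import sliding_window_view

      # A valid 2D convolution, one input channel to C output channels,
      # written as a single data-parallel expression over sliding windows.
      def conv2d(img, kernels):
          # img: (H, W); kernels: (C, kh, kw)
          wins = sliding_window_view(img, kernels.shape[1:])  # (H-kh+1, W-kw+1, kh, kw)
          # Contract the window axes against the kernel axes: result (H', W', C).
          return np.tensordot(wins, kernels, axes=([2, 3], [1, 2]))

      img = np.arange(25.0).reshape(5, 5)
      k = np.ones((2, 3, 3)) / 9.0          # two 3x3 averaging kernels
      out = conv2d(img, k)
      assert out.shape == (3, 3, 2)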

2015

  • Making Fortran Legacy Code More Functional: Using the BGS Geomagnetic Field Modelling System As an Example Hans-Nikolai Viessmann, Sven-Bodo Scholz, Artjoms Sinkarovs, Brian Bainbridge, Brian Hamilton, Simon Flower (2015). In Proceedings of the 27th Symposium on the Implementation and Application of Functional Programming Languages. pp. 11:1–11:13. ACM. New York, NY, USA. DOI BibTex Abstract
    @InProceedings{VSSIFL2015,
     author = {Viessmann, Hans-Nikolai and Scholz, Sven-Bodo and Sinkarovs, Artjoms and Bainbridge, Brian and Hamilton, Brian and Flower, Simon},
     title = {Making Fortran Legacy Code More Functional: Using the BGS Geomagnetic Field Modelling System As an Example},
     booktitle = {Proceedings of the 27th Symposium on the Implementation and Application of Functional Programming Languages},
     series = {IFL '15},
     year = {2015},
     isbn = {978-1-4503-4273-5},
     location = {Koblenz, Germany},
     pages = {11:1--11:13},
     articleno = {11},
     numpages = {13},
     doi = {10.1145/2897336.2897348},
     acmid = {2897348},
     publisher = {ACM},
     address = {New York, NY, USA},
     keywords = {eigensystem, foreign function interface, fortran, functional programming, high-performance computing, single-assignment C},
     summary = {This paper presents the support for calling SaC from Fortran and it reports about the experiences in the context of the BGS Geomagnetic Field Modelling System code.},
     abstract  = {This paper presents an application case study of the British Geological Survey's (BGS) Geomagnetic Field Modelling System code. The program consists of roughly 20 000 lines of highly-tuned Fortran MPI code that has a runtime of about 12 hours for a single execution cycle on a cluster utilising approximately 100 CPU cores. The program contains a sequential bottleneck that executes on a single node of the cluster and takes up to 50% of the overall runtime. We describe an experiment in which we rewrote the bottleneck Fortran code in SaC, to make use of auto-parallelisation provided by the SaC compiler. The paper also presents an implementation of a foreign-function interface, to link the SaC kernel with the Fortran application. Our initial performance measurements compare the SaC kernel performance with the Fortran bottleneck code; we also present results using an OpenMP Fortran implementation. Our figures show that the SaC-based implementation achieves roughly a 12.5% runtime improvement, and outperforms the OpenMP implementation.},
     category = {core,ffi,apps,par},
    }
    This paper presents an application case study of the British Geological Survey's (BGS) Geomagnetic Field Modelling System code. The program consists of roughly 20 000 lines of highly-tuned Fortran MPI code that has a runtime of about 12 hours for a single execution cycle on a cluster utilising approximately 100 CPU cores. The program contains a sequential bottleneck that executes on a single node of the cluster and takes up to 50% of the overall runtime. We describe an experiment in which we rewrote the bottleneck Fortran code in SaC, to make use of auto-parallelisation provided by the SaC compiler. The paper also presents an implementation of a foreign-function interface, to link the SaC kernel with the Fortran application. Our initial performance measurements compare the SaC kernel performance with the Fortran bottleneck code; we also present results using an OpenMP Fortran implementation. Our figures show that the SaC-based implementation achieves roughly a 12.5% runtime improvement, and outperforms the OpenMP implementation.

2012

  • Combining High Productivity and High Performance in Image Processing Using Single Assignment C on Multi-core CPUs and Many-core GPUs V. Wieser, C. Grelck, P. Haslinger, J. Guo, F. Korzeniowski, R. Bernecky, B. Moser, S.B. Scholz (2012). Journal of Electronic Imaging 21 (2) DOI wiesgrelhasl_jei12.pdf BibTex Abstract
    @article{WiesGrelHaslJEI12,
      author = {V. Wieser and C. Grelck and P. Haslinger and J. Guo and
                   F. Korzeniowski and R. Bernecky and B. Moser and S.B. Scholz},
      title = {Combining High Productivity and High Performance in Image
                   Processing Using {Single Assignment C} on Multi-core {CPUs}
                   and Many-core {GPUs}},
      journal = {Journal of Electronic Imaging},
      year = {2012},
      volume = {21},
      number = {2},
      abstract = {In this paper the challenge of parallelization development of industrial high performance inspection systems is addressed concerning a conventional parallelization approach versus an auto-parallelized technique. To this end, we introduce the functional array processing language Single Assignment C (SaC), which relies on a hardware virtualization concept for automated, parallel machine code generation for multicore CPUs and GPUs. Additionally, software engineering aspects like programmability, productivity, understandability, maintainability and the resulting gain in performance are discussed from the point of view of a developer. With several illustrative benchmarking examples from the field of image processing and machine learning, the relationship between runtime performance and efficiency of development is analyzed.},
      topics = {SAC},
      doi = {10.1117/1.JEI.21.2.021116},
      url = {wiesgrelhasl_jei12.pdf},
      category = {apps}
    }
    In this paper the challenge of parallelization development of industrial high performance inspection systems is addressed concerning a conventional parallelization approach versus an auto-parallelized technique. To this end, we introduce the functional array processing language Single Assignment C (SaC), which relies on a hardware virtualization concept for automated, parallel machine code generation for multicore CPUs and GPUs. Additionally, software engineering aspects like programmability, productivity, understandability, maintainability and the resulting gain in performance are discussed from the point of view of a developer. With several illustrative benchmarking examples from the field of image processing and machine learning, the relationship between runtime performance and efficiency of development is analyzed.

2011

  • Combining High Productivity and High Performance in Image Processing Using Single Assignment C Volkmar Wieser, Bernhard Moser, Sven-Bodo Scholz, Stephan Herhut, Jing Guo (2011). In 10th International Conference on Quality Control by Artificial Vision (QCAV'11), Saint Etienne, France. DOI 2011_3.pdf BibTex Abstract
    @InProceedings{WiesMoseSchoetalQCAV11,
      author    = {Volkmar Wieser and Bernhard Moser and Sven-Bodo Scholz and Stephan Herhut and Jing Guo},
      title     = {Combining High Productivity and High Performance in Image Processing Using Single Assignment C},
      booktitle = {10th International Conference on Quality Control by Artificial Vision (QCAV'11), Saint Etienne, France},
      year      = {2011},
      affil     = {ctca},
      abstract  = {In this paper the problem of high performance software engineering is addressed in the context of image processing, regarding productivity and optimized exploitation of hardware resources. To this end, we introduce the functional array processing language Single Assignment C (SaC), which relies on a hardware virtualization concept for automated, parallel machine code generation. An illustrative benchmarking example demonstrates both the utility and the adequacy of SaC for image processing.},
      category  = {apps},
      DOI       = {10.1117/12.890920},
      URL       = {2011_3.pdf},
    }
    In this paper the problem of high performance software engineering is addressed in the context of image processing, regarding productivity and optimized exploitation of hardware resources. To this end, we introduce the functional array processing language Single Assignment C (SaC), which relies on a hardware virtualization concept for automated, parallel machine code generation. An illustrative benchmarking example demonstrates both the utility and the adequacy of SaC for image processing.
  • Harnessing the Power of Gpus without Losing Abstractions in Sac and Arrayol: A Comparative Study Jing Guo, Antonio Wendell O Rodrigues, Jeyarajan Thiyagalingam, Frederic Guyomarc'h, Pierre Boulet, Sven-Bodo Scholz (2011). In 16th International Workshop on High-Level Parallel Programming Models and Supportive Environments (HIPS'11), Anchorage, Alaska, USA. IEEE Xplore. DOI 2011_4.pdf BibTex Abstract
    @InProceedings{GuoWendJeyaetalHIPS11,
      author    = {Jing Guo and Antonio Wendell O Rodrigues and Jeyarajan Thiyagalingam and Frederic Guyomarc'h and Pierre Boulet and Sven-Bodo Scholz},
      title     = {Harnessing the Power of Gpus without Losing Abstractions in Sac and Arrayol: A Comparative Study},
      booktitle = {16th International Workshop on High-Level Parallel Programming Models and Supportive Environments (HIPS'11), Anchorage, Alaska, USA},
      year      = {2011},
      editor    = {Thorsten Hoefler},
      publisher = {IEEE Xplore},
      abstract  = {Over recent years, using Graphics Processing Units (GPUs) has become an effective method for increasing the performance of many applications. However, these performance benefits from GPUs come at a price. Firstly, extensive programming expertise and intimate knowledge of the underlying hardware are essential for gaining good speedups. Secondly, the expressibility of GPU-based programs is not powerful enough to retain the high-level abstractions of the solutions. Although the programming experience has been significantly improved by existing frameworks like CUDA and OpenCL, it is still a challenge to effectively utilise these devices while retaining the programming abstractions. To this end, performing a source-to-source transformation, whereby a high-level language is mapped to CUDA or OpenCL, is an attractive option. In particular, it enables one to retain high-level abstractions and to harness the power of GPUs without any expertise in GPGPU programming. In this paper, we compare and analyse two such schemes. One of them is a transformation mechanism for mapping an image/signal-processing domain-specific language, ArrayOL, to OpenCL. The other one is a transformation route for mapping a high-level general-purpose array processing language, Single Assignment C (SaC), to CUDA. Using a real-world image processing application as a running example, we demonstrate that, despite being general purpose, the array processing language can be used to specify complex array access patterns generically. Performance of the generated CUDA code is comparable to that of the OpenCL code created from the domain-specific language.},
      affil     = {ctca},
      category  = {apps,design},
      doi       = {10.1109/IPDPS.2011.276},
      topics    = {SAC},
      url       = {2011_4.pdf},
    }
    Over recent years, using Graphics Processing Units (GPUs) has become an effective method for increasing the performance of many applications. However, these performance benefits from GPUs come at a price. Firstly, extensive programming expertise and intimate knowledge of the underlying hardware are essential for gaining good speedups. Secondly, the expressibility of GPU-based programs is not powerful enough to retain the high-level abstractions of the solutions. Although the programming experience has been significantly improved by existing frameworks like CUDA and OpenCL, it is still a challenge to effectively utilise these devices while retaining the programming abstractions. To this end, performing a source-to-source transformation, whereby a high-level language is mapped to CUDA or OpenCL, is an attractive option. In particular, it enables one to retain high-level abstractions and to harness the power of GPUs without any expertise in GPGPU programming. In this paper, we compare and analyse two such schemes. One of them is a transformation mechanism for mapping an image/signal-processing domain-specific language, ArrayOL, to OpenCL. The other one is a transformation route for mapping a high-level general-purpose array processing language, Single Assignment C (SaC), to CUDA. Using a real-world image processing application as a running example, we demonstrate that, despite being general purpose, the array processing language can be used to specify complex array access patterns generically. Performance of the generated CUDA code is comparable to that of the OpenCL code created from the domain-specific language.

2009

  • Numerical Simulations of Unsteady Shock Wave Interactions Using Sac and Fortran-90 Alexei Kudryavtsev, Daniel Rolls, Sven-Bodo Scholz, Alex Shafarenko (2009). In 10th International Conference on Parallel Computing Technologies (PaCT'09). pp. 445–456. Springer. 2009_3.pdf BibTex Abstract
    @InProceedings{RollSchoKudretalPACT09,
      author     = {Alexei Kudryavtsev and Daniel Rolls and Sven-Bodo Scholz and Alex Shafarenko},
      title      = {Numerical Simulations of Unsteady Shock Wave Interactions Using Sac and Fortran-90},
      booktitle  = {10th International Conference on Parallel Computing Technologies (PaCT'09)},
      year       = {2009},
      volume     = {5083},
      series     = {Lecture Notes in Computer Science},
      pages      = {445--456},
      publisher  = {Springer},
      abstract   = {This paper briefly introduces SaC: a data-parallel language with an imperative feel but side-effect free and declarative. The experiences of porting a simulation of unsteady shock waves in the Euler system from Fortran to SaC are reported. Both the SaC and the Fortran code were run on a 16-core AMD machine. We demonstrate the scalability and performance of our approach by comparison to Fortran.},
      affil      = {ctca},
      category   = {apps},
      pubaddress = {Berlin, Heidelberg, Germany},
      topics     = {SAC},
      url        = {2009_3.pdf},
    }
    This paper briefly introduces SaC: a data-parallel language with an imperative feel but side-effect free and declarative. The experiences of porting a simulation of unsteady shock waves in the Euler system from Fortran to SaC are reported. Both the SaC and the Fortran code were run on a 16-core AMD machine. We demonstrate the scalability and performance of our approach by comparison to Fortran.

2000

  • Hpf Vs. Sac — a Case Study Clemens Grelck, Sven-Bodo Scholz (2000). In Euro-Par 2000 Parallel Processing, 6th International Euro-Par Conference (Euro-Par'00), Munich, Germany. pp. 620–624. Springer. DOI hpf-vs-sac-muenchen-00.pdf BibTex Abstract
    @InProceedings{GrelSchoEUROPAR00,
      author     = {Clemens Grelck and Sven-Bodo Scholz},
      title      = {Hpf Vs. Sac --- a Case Study},
      booktitle  = {Euro-Par 2000 Parallel Processing, 6th International Euro-Par Conference (Euro-Par'00), Munich, Germany},
      year       = {2000},
      editor     = {Arndt Bode and Thomas Ludwig and Wolfgang Karl and Roland Wismüller},
      volume     = {1900},
      series     = {Lecture Notes in Computer Science},
      pages      = {620--624},
      publisher  = {Springer},
      abstract   = {This paper compares the functional programming language Sac to Hpf with respect to specificational elegance and runtime performance. A well-known benchmark, red-black SOR, serves as a case study. After presenting the Hpf reference implementation, alternative Sac implementations are discussed. Finally, performance figures show the ability to compile highly generic Sac specifications into machine code that outperforms the Hpf implementation on a shared memory multi-processor by a factor of about 3.},
      category   = {core,apps,par},
      doi        = {10.1007/3-540-44520-X_87},
      pubaddress = {Berlin, Heidelberg, Germany},
      summary    = {This paper compares SAC to HPF in terms of expressiveness and execution runtimes. Red/black successive over-relaxation serves as a case study. Several SAC specifications are presented and compared with a HPF reference implementation. Performance figures for up to 10 processors of a shared memory multiprocessor are presented.},
      topics     = {SAC,HPF},
      url        = {hpf-vs-sac-muenchen-00.pdf},
    }
    This paper compares the functional programming language Sac to Hpf with respect to specificational elegance and runtime performance. A well-known benchmark, red-black SOR, serves as a case study. After presenting the Hpf reference implementation, alternative Sac implementations are discussed. Finally, performance figures show the ability to compile highly generic Sac specifications into machine code that outperforms the Hpf implementation on a shared memory multi-processor by a factor of about 3.
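    For reference, a red-black SOR sweep of the kind benchmarked here can be written in a few data-parallel lines. The sketch below is a NumPy rendering of the standard algorithm, not the paper's Hpf or Sac code; sor_sweep, the Laplace stencil, and the relaxation factor are our choices.

      import numpy as np

      # One red-black SOR sweep for the 2D Laplace equation on the grid
      # interior. Points are coloured by the parity of i+j; all points of
      # one colour can be updated in parallel.
      def sor_sweep(u, omega=1.5):
          for colour in (0, 1):
              avg = 0.25 * (u[:-2, 1:-1] + u[2:, 1:-1] + u[1:-1, :-2] + u[1:-1, 2:])
              i, j = np.indices(avg.shape)
              mask = (i + j) % 2 == colour
              interior = u[1:-1, 1:-1]              # a view: writes go through to u
              interior[mask] += omega * (avg[mask] - interior[mask])
          return u

      u = np.zeros((6, 6)); u[0, :] = 1.0           # hot top boundary
      for _ in range(50):
          u = sor_sweep(u)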

1999

  • Accelerating Apl Programs with Sac Clemens Grelck, Sven-Bodo Scholz (1999). In International Conference on Array Processing Languages (APL'99), Scranton, Pennsylvania, USA. pp. 50–57. ACM Press. DOI sac-accel-apl-scranton-99.pdf BibTex Abstract
    @InProceedings{GrelSchoAPL99,
      author    = {Clemens Grelck and Sven-Bodo Scholz},
      title     = {Accelerating Apl Programs with Sac},
      booktitle = {International Conference on Array Processing Languages (APL'99), Scranton, Pennsylvania, USA},
      year      = {1999},
      editor    = {Olivier Lefèvre},
      volume    = {29},
      number    = {2},
      series    = {APL Quote Quad},
      pages     = {50--57},
      publisher = {ACM Press},
      abstract  = {The paper investigates how SAC, a purely functional language based on C syntax, relates to APL in terms of expressiveness and run-time behavior. To do so, three different excerpts of real world APL programs are examined. It is shown that after defining the required APL primitives in SAC, the example programs can be re-written in SAC with an almost one-to-one correspondence. Run-time comparisons between interpreting APL programs and compiled SAC programs show that speedups due to compilation vary between 2 and 500 for three representative benchmark programs.},
      category  = {core,apps,design},
      doi       = {10.1145/379277.312719},
      summary   = {This paper compares SAC to APL in terms of expressiveness and execution runtimes. Some APL applications are compiled by hand into almost identical SAC programs and the resulting execution times are discussed.},
      topics    = {SAC,APL},
      url       = {sac-accel-apl-scranton-99.pdf},
    }
    The paper investigates how SAC, a purely functional language based on C syntax, relates to APL in terms of expressiveness and run-time behavior. To do so, three different excerpts of real world APL programs are examined. It is shown that after defining the required APL primitives in SAC, the example programs can be re-written in SAC with an almost one-to-one correspondence. Run-time comparisons between interpreting APL programs and compiled SAC programs show that speedups due to compilation vary between 2 and 500 for three representative benchmark programs.
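    The approach of defining the required APL primitives once and then transliterating the programs almost one-to-one can be illustrated in NumPy. The two primitives below are our sketches, not the paper's SAC definitions; rotate and take follow the usual APL semantics for vectors.

      import numpy as np

      # Two APL-style primitives, defined once and then reused, in the
      # spirit of the paper's approach (these NumPy versions are ours).
      def rotate(k, x):         # APL rotate: cyclic shift left by k
          return np.roll(x, -k, axis=0)

      def take(k, x):           # APL take: first k items (last k if k < 0)
          return x[:k] if k >= 0 else x[k:]

      x = np.arange(8)
      assert np.array_equal(rotate(2, x), np.array([2, 3, 4, 5, 6, 7, 0, 1]))
      assert np.array_equal(take(-3, x), np.array([5, 6, 7]))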
  • A Case Study: Effects of With-loop Folding on the Nas Benchmark Mg in Sac Sven-Bodo Scholz (1999). In Implementation of Functional Languages, 10th International Workshop (IFL'98), London, England, UK, Selected Papers. pp. 216–228. Springer. DOI wlf-exp-london-98.pdf BibTex Abstract
    @InProceedings{ScholzIFL98,
      author     = {Sven-Bodo Scholz},
      title      = {A Case Study: Effects of With-loop Folding on the Nas Benchmark Mg in Sac},
      booktitle  = {Implementation of Functional Languages, 10th International Workshop (IFL'98), London, England, UK, Selected Papers},
      year       = {1999},
      editor     = {Kevin Hammond and Tony Davie and Chris Clack},
      volume     = {1595},
      series     = {Lecture Notes in Computer Science},
      pages      = {216--228},
      publisher  = {Springer},
      abstract   = {Sac is a functional C variant with efficient support for high-level array operations. This paper investigates the applicability of a Sac-specific optimization technique called with-loop-folding to real world applications. As an example program which originates from the Numerical Aerodynamic Simulation (NAS) Program developed at NASA Ames Research Center, the so-called NAS benchmark MG is chosen. It comprises a kernel from the NAS Program which implements 3-dimensional multigrid relaxation. Several run-time measurements demonstrate two different benefits of with-loop-folding: First, an overall speed-up of about 20% can be observed. Second, a comparison between the run-times of a hand-optimized specification and of APL-like specifications yields identical run-times, although a naive compilation that does not apply with-loop-folding leads to slowdowns of more than an order of magnitude. Furthermore, with-loop-folding makes a slight variation of the algorithm feasible which substantially simplifies the program specification and requires less memory during execution. Finally, the optimized run-times are compared against run-times gained from the original Fortran program, which shows that for different problem sizes, the code generated from the Sac program not only reaches the execution times of the code generated from the Fortran program but even outperforms them by about 10%.},
      category   = {apps,core,opt},
      doi        = {10.1007/3-540-48515-5_14},
      pubaddress = {Berlin, Heidelberg, Germany},
      summary    = {This paper investigates the impact of high-level program transformations, WITH-loop folding in particular, on the style of program specifications and the runtime performance of programs.},
      topics     = {SAC,Avoiding Temporaries,NAS},
      url        = {wlf-exp-london-98.pdf},
    }
    Sac is a functional C variant with efficient support for high-level array operations. This paper investigates the applicability of a Sac-specific optimization technique called with-loop-folding to real world applications. As an example program which originates from the Numerical Aerodynamic Simulation (NAS) Program developed at NASA Ames Research Center, the so-called NAS benchmark MG is chosen. It comprises a kernel from the NAS Program which implements 3-dimensional multigrid relaxation. Several run-time measurements demonstrate two different benefits of with-loop-folding: First, an overall speed-up of about 20% can be observed. Second, a comparison between the run-times of a hand-optimized specification and of APL-like specifications yields identical run-times, although a naive compilation that does not apply with-loop-folding leads to slowdowns of more than an order of magnitude. Furthermore, with-loop-folding makes a slight variation of the algorithm feasible which substantially simplifies the program specification and requires less memory during execution. Finally, the optimized run-times are compared against run-times gained from the original Fortran program, which shows that for different problem sizes, the code generated from the Sac program not only reaches the execution times of the code generated from the Fortran program but even outperforms them by about 10%.
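    The effect of with-loop-folding can be illustrated outside Sac. In the unfused version below, the first operation materialises a temporary array that the second operation traverses again; the folded version computes the same result in a single pass. This Python sketch is our illustration of the optimisation the Sac compiler performs automatically on with-loops; the function names are ours.

      import numpy as np

      def unfolded(x):
          tmp = x * 2.0        # first loop: materialises a temporary array
          return tmp + 1.0     # second loop: traverses the temporary again

      def folded(x):
          # one fused traversal, no temporary array
          out = np.empty_like(x)
          for i, v in enumerate(x.flat):
              out.flat[i] = v * 2.0 + 1.0
          return out

      x = np.arange(5.0)
      assert np.array_equal(unfolded(x), folded(x))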

1997

  • An Overview of Sac – a Functional Language for Numerical Applications Sven-Bodo Scholz (1997). In Programming Languages and Fundamentals of Programming, Technical Report 9717. Institut für Informatik und Praktische Mathematik, Universität Kiel. BibTex
    @InProceedings{ScholzKPS97,
      author    = {Sven-Bodo Scholz},
      title     = {An Overview of Sac -- a Functional Language for Numerical Applications},
      booktitle = {Programming Languages and Fundamentals of Programming, Technical Report 9717},
      year      = {1997},
      editor    = {R. Berghammer and F. Simon},
      publisher = {{Institut f{\"u}r Informatik und Praktische Mathe\-matik, Universit{\"a}t Kiel}},
      category  = {core,design,apps},
      topics    = {SAC,on SAC},
    }
  • On Programming Scientific Applications in Sac - a Functional Language Extended by a Subsystem for High-level Array Operations Sven-Bodo Scholz (1997). In Implementation of Functional Languages, 8th International Workshop (IFL'96), Bonn, Germany, Selected Papers. pp. 85–104. Springer-Verlag. scientific-applications-sac-bonn-96.pdf BibTex Abstract
    @InProceedings{ScholzIFL96,
      author     = {Sven-Bodo Scholz},
      title      = {On Programming Scientific Applications in Sac - a Functional Language Extended by a Subsystem for High-level Array Operations},
      booktitle  = {Implementation of Functional Languages, 8th International Workshop (IFL'96), Bonn, Germany, Selected Papers},
      year       = {1997},
      editor     = {Werner Kluge},
      volume     = {1268},
      series     = {Lecture Notes in Computer Science},
      pages      = {85--104},
      publisher  = {Springer-Verlag},
      abstract   = {This paper discusses some of the pros and cons of extending a simple functional language called Sac (for Single-Assignment C) by array operations similar to those that are available in APL. The array operations in Sac are based on the ψ-calculus, an algebra of arrays which provides a formalism for specifying and simplifying array operations in terms of index set manipulations. The programming techniques made possible by Sac are demonstrated by means of a functional program for the approximation of numerical solutions of partial differential equations by multigrid relaxation. This application is not only of practical relevance but also fully exposes the flavors of using high-level array operations. In contrast to specifications in other languages, e.g. in Fortran or Sisal, the Sac program is well structured, reasonably concise, and, what is most important, invariant against dimensionalities and shapes. However, sophisticated compilation techniques are necessary to avoid, whenever possible, the creation of temporary arrays and to eliminate redundant operations. The paper also includes performance figures for a Sac implementation of the NAS-mgrid-benchmark which are competitive with those of a Sisal implementation.},
      category   = {apps,core,opt},
      pubaddress = {Berlin, Heidelberg, Germany},
      summary    = {This paper describes the high-level programming style intended by SAC. Multi-dimensional multigrid relaxation is investigated as a case study. The runtime performance of the SAC implementation is compared to equivalent Sisal and Fortran specifications both in terms of execution time and memory consumption.},
      topics     = {SAC,Scientific Computation},
      url        = {scientific-applications-sac-bonn-96.pdf},
    }
    This paper discusses some of the pros and cons of extending a simple functional language called Sac (for Single-Assignment C) by array operations similar to those that are available in APL. The array operations in Sac are based on the ψ-calculus, an algebra of arrays which provides a formalism for specifying and simplifying array operations in terms of index set manipulations. The programming techniques made possible by Sac are demonstrated by means of a functional program for the approximation of numerical solutions of partial differential equations by multigrid relaxation. This application is not only of practical relevance but also fully exposes the flavors of using high-level array operations. In contrast to specifications in other languages, e.g. in Fortran or Sisal, the Sac program is well structured, reasonably concise, and, what is most important, invariant against dimensionalities and shapes. However, sophisticated compilation techniques are necessary to avoid, whenever possible, the creation of temporary arrays and to eliminate redundant operations. The paper also includes performance figures for a Sac implementation of the NAS-mgrid-benchmark which are competitive with those of a Sisal implementation.
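    The shape-invariant style the paper advocates can be approximated in NumPy: the relaxation step below is written once and works for arguments of any rank, averaging the two neighbours along every axis. It is our illustration, assuming periodic boundaries, not the paper's Sac multigrid code; relax is a hypothetical name.

      import numpy as np

      # A rank-invariant relaxation step: averages the two neighbours
      # along every axis of u, with periodic boundaries via np.roll.
      def relax(u):
          neighbours = sum(np.roll(u, shift, axis=ax)
                           for ax in range(u.ndim) for shift in (-1, 1))
          return neighbours / (2 * u.ndim)

      print(relax(np.arange(8.0)).shape)        # works on vectors ...
      print(relax(np.zeros((4, 4, 4))).shape)   # ... and on rank-3 arrays alike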

1996

  • Single Assignment C – Entwurf Und Implementierung Einer Funktionalen C-variante Mit Spezieller Unterstützung Shape-invarianter Array-operationen [Single Assignment C: Design and Implementation of a Functional C Variant with Special Support for Shape-Invariant Array Operations] Sven-Bodo Scholz (1996). PhD thesis, Institut für Informatik und Praktische Mathematik, Christian-Albrechts-Universität, Kiel, Germany. sac-design-sbs-phd-96.pdf BibTex
    @PhdThesis{Scholz96,
      author   = {Sven-Bodo Scholz},
      title    = {Single Assignment C -- Entwurf Und Implementierung Einer Funktionalen C-variante Mit Spezieller Unterstützung Shape-invarianter Array-operationen},
      school   = {Institut für Informatik und Praktische Mathematik, Christian-Albrechts-Universität},
      year     = {1996},
      address  = {Kiel, Germany},
      note     = {Shaker Verlag, Aachen, 1997},
      category = {design,core,types,apps},
      summary  = {Sven-Bodo's PhD thesis is the most comprehensive document about SAC. The language design is described in detail. Moreover, a formal semantics of SAC is given. Further points of emphasis are the type system and the compilation process. Multigrid relaxation is used as a case study to illustrate programming with SAC.},
      topics   = {SAC,Language Manuals,Implementation of Arrays,Language Design},
      url      = {sac-design-sbs-phd-96.pdf},
    }